// logicaffeine_language/lexer.rs

1//! Two-stage lexer for LOGOS natural language input.
2//!
3//! The lexer transforms natural language text into a token stream suitable
4//! for parsing. It operates in two stages:
5//!
6//! ## Stage 1: Line Lexer
7//!
8//! The [`LineLexer`] handles structural concerns:
9//!
10//! - **Indentation**: Tracks indent levels, emits `Indent`/`Dedent` tokens
11//! - **Block boundaries**: Identifies significant whitespace
12//! - **Content extraction**: Passes line content to Stage 2
13//!
14//! ## Stage 2: Word Lexer
15//!
16//! The [`Lexer`] performs word-level tokenization:
17//!
18//! - **Vocabulary lookup**: Identifies words via the lexicon database
19//! - **Morphological analysis**: Handles inflection (verb tenses, plurals)
20//! - **Ambiguity resolution**: Uses priority rules for ambiguous words
21//!
22//! ## Ambiguity Rules
23//!
24//! When a word matches multiple lexicon entries, priority determines the token:
25//!
26//! 1. **Quantifiers** over nouns ("some" → Quantifier, not Noun)
27//! 2. **Determiners** over adjectives ("the" → Determiner, not Adjective)
28//! 3. **Verbs** over nouns for -ing/-ed forms ("running" → Verb)
29//!
30//! ## Example
31//!
32//! ```text
33//! Input:  "Every cat sleeps."
34//! Output: [Quantifier("every"), Noun("cat"), Verb("sleeps"), Period]
35//! ```
36
37use logicaffeine_base::Interner;
38use crate::lexicon::{self, Aspect, Definiteness, Lexicon, Time};
39use crate::token::{BlockType, CalendarUnit, FocusKind, MeasureKind, Span, Token, TokenType};
40
41// ============================================================================
42// Stage 1: Line Lexer (Spec §2.5.2)
43// ============================================================================
44
/// Tokens emitted by the LineLexer (Stage 1).
/// Handles structural tokens (Indent, Dedent, Newline) while treating
/// all other content as opaque for Stage 2 word classification.
#[derive(Debug, Clone, PartialEq)]
pub enum LineToken {
    /// Block increased indentation relative to the previous line.
    Indent,
    /// Block decreased indentation; one `Dedent` is emitted per popped
    /// indent level (a line may produce several in a row).
    Dedent,
    /// Logical newline (statement boundary) - reserved for future use
    Newline,
    /// Content to be further tokenized. `text` is the line with trailing
    /// whitespace trimmed; `start`/`end` are byte offsets into the source
    /// covering the untrimmed content.
    Content { text: String, start: usize, end: usize },
}
59
/// Stage 1 Lexer: Handles only lines, indentation, and structural tokens.
/// Treats all other text as opaque `Content` for the Stage 2 WordLexer.
pub struct LineLexer<'a> {
    /// Full input text.
    source: &'a str,
    /// Byte view of `source`, used for line and indent scanning.
    bytes: &'a [u8],
    /// Stack of active indent widths; element 0 is a sentinel 0 that is
    /// never popped.
    indent_stack: Vec<usize>,
    /// Number of `Dedent` tokens still owed to the caller.
    pending_dedents: usize,
    /// Current scan position (byte offset into `source`).
    position: usize,
    /// True if we need to emit Content for current line
    has_pending_content: bool,
    // Byte range and trimmed text of the stored line; only meaningful
    // while `has_pending_content` is true.
    pending_content_start: usize,
    pending_content_end: usize,
    pending_content_text: String,
    /// True after we've finished processing all lines
    finished_lines: bool,
    /// True if we've emitted at least one Indent (need to emit Dedents at EOF)
    emitted_indent: bool,
    /// Escape block body byte ranges to skip (start_byte, end_byte)
    escape_body_ranges: Vec<(usize, usize)>,
}
80
81impl<'a> LineLexer<'a> {
82    pub fn new(source: &'a str) -> Self {
83        Self {
84            source,
85            bytes: source.as_bytes(),
86            indent_stack: vec![0],
87            pending_dedents: 0,
88            position: 0,
89            has_pending_content: false,
90            pending_content_start: 0,
91            pending_content_end: 0,
92            pending_content_text: String::new(),
93            finished_lines: false,
94            emitted_indent: false,
95            escape_body_ranges: Vec::new(),
96        }
97    }
98
99    pub fn with_escape_ranges(source: &'a str, escape_body_ranges: Vec<(usize, usize)>) -> Self {
100        Self {
101            source,
102            bytes: source.as_bytes(),
103            indent_stack: vec![0],
104            pending_dedents: 0,
105            position: 0,
106            has_pending_content: false,
107            pending_content_start: 0,
108            pending_content_end: 0,
109            pending_content_text: String::new(),
110            finished_lines: false,
111            emitted_indent: false,
112            escape_body_ranges,
113        }
114    }
115
116    /// Check if a byte position falls within an escape body range.
117    fn is_in_escape_body(&self, pos: usize) -> bool {
118        self.escape_body_ranges.iter().any(|(start, end)| pos >= *start && pos < *end)
119    }
120
121    /// Calculate indentation level at current position (at start of line).
122    /// Returns (indent_level, content_start_pos).
123    fn measure_indent(&self, line_start: usize) -> (usize, usize) {
124        let mut indent = 0;
125        let mut pos = line_start;
126
127        while pos < self.bytes.len() {
128            match self.bytes[pos] {
129                b' ' => {
130                    indent += 1;
131                    pos += 1;
132                }
133                b'\t' => {
134                    indent += 4; // Tab = 4 spaces
135                    pos += 1;
136                }
137                _ => break,
138            }
139        }
140
141        (indent, pos)
142    }
143
144    /// Read content from current position until end of line or EOF.
145    /// Returns (content_text, content_start, content_end, next_line_start).
146    fn read_line_content(&self, content_start: usize) -> (String, usize, usize, usize) {
147        let mut pos = content_start;
148
149        // Find end of line
150        while pos < self.bytes.len() && self.bytes[pos] != b'\n' {
151            pos += 1;
152        }
153
154        let content_end = pos;
155        let text = self.source[content_start..content_end].trim_end().to_string();
156
157        // Move past newline if present
158        let next_line_start = if pos < self.bytes.len() && self.bytes[pos] == b'\n' {
159            pos + 1
160        } else {
161            pos
162        };
163
164        (text, content_start, content_end, next_line_start)
165    }
166
167    /// Check if the line starting at `pos` is blank (only whitespace).
168    fn is_blank_line(&self, line_start: usize) -> bool {
169        let mut pos = line_start;
170        while pos < self.bytes.len() {
171            match self.bytes[pos] {
172                b' ' | b'\t' => pos += 1,
173                b'\n' => return true,
174                _ => return false,
175            }
176        }
177        true // EOF counts as blank
178    }
179
180    /// Process the next line and update internal state.
181    /// Returns true if we have tokens to emit, false if we're done.
182    fn process_next_line(&mut self) -> bool {
183        // Skip blank lines
184        while self.position < self.bytes.len() && self.is_blank_line(self.position) {
185            // Skip to next line
186            while self.position < self.bytes.len() && self.bytes[self.position] != b'\n' {
187                self.position += 1;
188            }
189            if self.position < self.bytes.len() {
190                self.position += 1; // Skip the newline
191            }
192        }
193
194        // Check if we've reached EOF
195        if self.position >= self.bytes.len() {
196            self.finished_lines = true;
197            // Emit remaining dedents at EOF
198            if self.indent_stack.len() > 1 {
199                self.pending_dedents = self.indent_stack.len() - 1;
200                self.indent_stack.truncate(1);
201            }
202            return self.pending_dedents > 0;
203        }
204
205        // Measure indentation of current line
206        let (line_indent, content_start) = self.measure_indent(self.position);
207
208        // Read line content
209        let (text, start, end, next_pos) = self.read_line_content(content_start);
210
211        // Skip if content is empty (shouldn't happen after blank line skip, but be safe)
212        if text.is_empty() {
213            self.position = next_pos;
214            return self.process_next_line();
215        }
216
217        let current_indent = *self.indent_stack.last().unwrap();
218
219        // Handle indentation changes
220        if line_indent > current_indent {
221            // Indent: push new level
222            self.indent_stack.push(line_indent);
223            self.emitted_indent = true;
224            // Store content to emit after Indent
225            self.has_pending_content = true;
226            self.pending_content_text = text;
227            self.pending_content_start = start;
228            self.pending_content_end = end;
229            self.position = next_pos;
230            // We'll emit Indent first, then Content
231            return true;
232        } else if line_indent < current_indent {
233            // Dedent: pop until we match
234            while self.indent_stack.len() > 1 {
235                let top = *self.indent_stack.last().unwrap();
236                if line_indent < top {
237                    self.indent_stack.pop();
238                    self.pending_dedents += 1;
239                } else {
240                    break;
241                }
242            }
243            // Store content to emit after Dedents
244            self.has_pending_content = true;
245            self.pending_content_text = text;
246            self.pending_content_start = start;
247            self.pending_content_end = end;
248            self.position = next_pos;
249            return true;
250        } else {
251            // Same indentation level
252            self.has_pending_content = true;
253            self.pending_content_text = text;
254            self.pending_content_start = start;
255            self.pending_content_end = end;
256            self.position = next_pos;
257            return true;
258        }
259    }
260}
261
impl<'a> Iterator for LineLexer<'a> {
    type Item = LineToken;

    /// Emits one structural token per call, in priority order:
    /// queued `Dedent`s, then queued `Content`, then whatever the next
    /// scanned line produces.
    fn next(&mut self) -> Option<LineToken> {
        // 1. Emit pending dedents first
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        // 2. Emit pending content (queued by an earlier call that had to
        //    emit an Indent/Dedent before this line's text).
        if self.has_pending_content {
            self.has_pending_content = false;
            let text = std::mem::take(&mut self.pending_content_text);
            let start = self.pending_content_start;
            let end = self.pending_content_end;
            return Some(LineToken::Content { text, start, end });
        }

        // 3. Scan the next line. process_next_line() updates the indent
        //    stack and queues dedents/content as state; the checks below
        //    pick the single token to emit now and leave the rest queued
        //    for subsequent next() calls.

        // 4. Process next line
        if !self.finished_lines {
            // Stack depth before the scan: if it grew, the line was indented.
            let had_indent = self.indent_stack.len();
            if self.process_next_line() {
                // Check if we added an indent level
                if self.indent_stack.len() > had_indent {
                    return Some(LineToken::Indent);
                }
                // Check if we have pending dedents
                if self.pending_dedents > 0 {
                    self.pending_dedents -= 1;
                    return Some(LineToken::Dedent);
                }
                // Otherwise emit content
                if self.has_pending_content {
                    self.has_pending_content = false;
                    let text = std::mem::take(&mut self.pending_content_text);
                    let start = self.pending_content_start;
                    let end = self.pending_content_end;
                    return Some(LineToken::Content { text, start, end });
                }
            } else if self.pending_dedents > 0 {
                // EOF with pending dedents
                self.pending_dedents -= 1;
                return Some(LineToken::Dedent);
            }
        }

        // 5. Emit any remaining dedents at EOF
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        None
    }
}
322
323// ============================================================================
324// Stage 2: Word Lexer (existing Lexer)
325// ============================================================================
326
/// Mode selector for the Stage 2 word lexer, distinguishing declarative
/// input (logic, theorems, definitions) from imperative input (main,
/// functions, code).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    /// Logic, theorems, definitions (the default).
    #[default]
    Declarative, // Logic, Theorems, Definitions
    /// Main, functions, code.
    Imperative,  // Main, Functions, Code
}
333
/// Stage 2 word-level lexer: classifies words into tokens via the
/// lexicon database (see module docs).
pub struct Lexer<'a> {
    /// Pre-split word items produced by `split_into_words`.
    words: Vec<WordItem>,
    /// Index of the next unconsumed entry in `words`.
    pos: usize,
    /// Vocabulary database used for classification.
    lexicon: Lexicon,
    /// String interner for symbol handling.
    interner: &'a mut Interner,
    /// Byte length of the original input.
    input_len: usize,
    // Presumably tracks being inside a "Let ..." construct; set/read
    // outside this chunk — TODO confirm against the consuming code.
    in_let_context: bool,
    /// Current tokenization mode (declarative vs imperative).
    mode: LexerMode,
    /// Owned copy of the full input text.
    source: String,
    /// Escape block body byte ranges: (skip_start, skip_end) for filtering LineLexer events
    escape_body_ranges: Vec<(usize, usize)>,
}
346
/// One whitespace-delimited word (or special marker) with its position
/// in the original input.
struct WordItem {
    /// The word text. Special items use a `\x00` prefix, e.g. `\x00STR:`
    /// for string literals and `\x00ESC:` for escape-block bodies.
    word: String,
    /// Punctuation character immediately following the word, if any.
    trailing_punct: Option<char>,
    /// Offset where the word starts.
    start: usize,
    /// Offset just past the word (excludes `trailing_punct`).
    /// NOTE(review): most producers store byte offsets, but the string-
    /// literal paths appear to mix in char indices from a `Vec<char>` —
    /// verify on non-ASCII input.
    end: usize,
    /// Offset of `trailing_punct`, if present.
    punct_pos: Option<usize>,
}
354
355impl<'a> Lexer<'a> {
356    /// Creates a new lexer for the given input text.
357    ///
358    /// The lexer will tokenize natural language text according to the
359    /// lexicon database, performing morphological analysis and ambiguity
360    /// resolution.
361    ///
362    /// # Arguments
363    ///
364    /// * `input` - The natural language text to tokenize
365    /// * `interner` - String interner for efficient symbol handling
366    ///
367    /// # Example
368    ///
369    /// ```
370    /// use logicaffeine_language::lexer::Lexer;
371    /// use logicaffeine_base::Interner;
372    ///
373    /// let mut interner = Interner::new();
374    /// let mut lexer = Lexer::new("Every cat sleeps.", &mut interner);
375    /// let tokens = lexer.tokenize();
376    ///
377    /// assert_eq!(tokens.len(), 5); // Quantifier, Noun, Verb, Period, EOI
378    /// ```
379    pub fn new(input: &str, interner: &'a mut Interner) -> Self {
380        let escape_ranges = Self::find_escape_block_ranges(input);
381        let escape_body_ranges: Vec<(usize, usize)> = escape_ranges.iter()
382            .map(|(_, end, content_start, _)| (*content_start, *end))
383            .collect();
384        let words = Self::split_into_words(input, &escape_ranges);
385        let input_len = input.len();
386
387        Lexer {
388            words,
389            pos: 0,
390            lexicon: Lexicon::new(),
391            interner,
392            input_len,
393            in_let_context: false,
394            mode: LexerMode::Declarative,
395            source: input.to_string(),
396            escape_body_ranges,
397        }
398    }
399
    /// Pre-scan source text for escape block bodies.
    /// Returns (skip_start_byte, skip_end_byte, content_start_byte, raw_code) tuples.
    /// `skip_start` is the line start (for byte skipping in split_into_words).
    /// `content_start` is after leading whitespace (for token span alignment with Indent events).
    ///
    /// A header matches when its trimmed, lowercased form is exactly
    /// "escape to rust:", ends with " escape to rust:" (expression
    /// position), or has the general shape "escape to <target>:".
    /// The body is the run of following lines indented deeper than the
    /// header; its indentation is stripped to produce `raw_code`.
    fn find_escape_block_ranges(source: &str) -> Vec<(usize, usize, usize, String)> {
        let mut ranges = Vec::new();
        let lines: Vec<&str> = source.split('\n').collect();
        // Byte offset of the start of each line, for line-index -> byte mapping.
        let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len());
        let mut pos = 0;
        for line in &lines {
            line_starts.push(pos);
            pos += line.len() + 1; // +1 for the newline
        }

        let mut i = 0;
        while i < lines.len() {
            let trimmed = lines[i].trim();
            // Check if this line contains an escape header: "Escape to Rust:"
            // Matches both statement position (whole line) and expression position
            // (e.g., "Let x: Int be Escape to Rust:")
            let lower = trimmed.to_lowercase();
            if lower == "escape to rust:" ||
               lower.ends_with(" escape to rust:") ||
               (lower.starts_with("escape to ") && lower.ends_with(':'))
            {
                // Find the body: subsequent lines with deeper indentation
                let header_indent = Self::measure_indent_static(lines[i]);
                i += 1;

                // Skip blank lines to find the first body line
                let mut body_start_line = i;
                while body_start_line < lines.len() && lines[body_start_line].trim().is_empty() {
                    body_start_line += 1;
                }

                if body_start_line >= lines.len() {
                    // No body found; `i` already points past the header,
                    // so scanning resumes there.
                    continue;
                }

                let base_indent = Self::measure_indent_static(lines[body_start_line]);
                if base_indent <= header_indent {
                    // No indented body; resume scanning after the header.
                    continue;
                }

                // Capture all lines at base_indent or deeper
                let body_byte_start = line_starts[body_start_line];
                let mut body_end_line = body_start_line;
                let mut code_lines: Vec<String> = Vec::new();

                let mut j = body_start_line;
                while j < lines.len() {
                    let line = lines[j];
                    if line.trim().is_empty() {
                        // Blank lines are preserved
                        code_lines.push(String::new());
                        body_end_line = j;
                        j += 1;
                        continue;
                    }
                    let line_indent = Self::measure_indent_static(line);
                    if line_indent < base_indent {
                        break;
                    }
                    // Strip base indentation
                    let stripped = Self::strip_indent(line, base_indent);
                    code_lines.push(stripped);
                    body_end_line = j;
                    j += 1;
                }

                // Trim trailing empty lines from code
                while code_lines.last().map_or(false, |l| l.is_empty()) {
                    code_lines.pop();
                }

                if !code_lines.is_empty() {
                    // The skip range extends through the last captured line,
                    // including trailing blanks trimmed from the code text
                    // above — those bytes still must be skipped.
                    let body_byte_end = if body_end_line + 1 < lines.len() {
                        line_starts[body_end_line + 1]
                    } else {
                        source.len()
                    };
                    // Compute content start (after leading whitespace of first body line)
                    let content_start = body_byte_start + Self::leading_whitespace_bytes(lines[body_start_line]);
                    let raw_code = code_lines.join("\n");
                    ranges.push((body_byte_start, body_byte_end, content_start, raw_code));
                }

                i = j;
            } else {
                i += 1;
            }
        }

        ranges
    }
497
498    /// Count leading whitespace bytes in a line.
499    fn leading_whitespace_bytes(line: &str) -> usize {
500        let mut count = 0;
501        for c in line.chars() {
502            match c {
503                ' ' | '\t' => count += c.len_utf8(),
504                _ => break,
505            }
506        }
507        count
508    }
509
510    /// Measure indent of a line (static helper for pre-scan).
511    fn measure_indent_static(line: &str) -> usize {
512        let mut indent = 0;
513        for c in line.chars() {
514            match c {
515                ' ' => indent += 1,
516                '\t' => indent += 4,
517                _ => break,
518            }
519        }
520        indent
521    }
522
523    /// Strip `count` leading spaces/tabs from a line.
524    fn strip_indent(line: &str, count: usize) -> String {
525        let mut stripped = 0;
526        let mut byte_pos = 0;
527        for (i, c) in line.char_indices() {
528            if stripped >= count {
529                byte_pos = i;
530                break;
531            }
532            match c {
533                ' ' => { stripped += 1; byte_pos = i + 1; }
534                '\t' => { stripped += 4; byte_pos = i + 1; }
535                _ => { byte_pos = i; break; }
536            }
537        }
538        if stripped < count {
539            byte_pos = line.len();
540        }
541        line[byte_pos..].to_string()
542    }
543
544    fn split_into_words(input: &str, escape_ranges: &[(usize, usize, usize, String)]) -> Vec<WordItem> {
545        let mut items = Vec::new();
546        let mut current_word = String::new();
547        let mut word_start = 0;
548        let chars: Vec<char> = input.chars().collect();
549        let mut char_idx = 0;
550        let mut skip_count = 0;
551        // Track byte offset for escape range matching
552        let mut skip_to_byte: Option<usize> = None;
553
554        for (i, c) in input.char_indices() {
555            if skip_count > 0 {
556                skip_count -= 1;
557                char_idx += 1;
558                continue;
559            }
560            // Skip bytes inside escape block bodies
561            if let Some(end) = skip_to_byte {
562                if i < end {
563                    char_idx += 1;
564                    continue;
565                }
566                skip_to_byte = None;
567                word_start = i;
568            }
569            // Check if this byte position starts an escape block body
570            if let Some((_, end, content_start, raw_code)) = escape_ranges.iter().find(|(s, _, _, _)| i == *s) {
571                // Flush any pending word
572                if !current_word.is_empty() {
573                    items.push(WordItem {
574                        word: std::mem::take(&mut current_word),
575                        trailing_punct: None,
576                        start: word_start,
577                        end: i,
578                        punct_pos: None,
579                    });
580                }
581                // Emit the entire block as a single \x00ESC: marker
582                // Use content_start (after whitespace) for span alignment with Indent events
583                items.push(WordItem {
584                    word: format!("\x00ESC:{}", raw_code),
585                    trailing_punct: None,
586                    start: *content_start,
587                    end: *end,
588                    punct_pos: None,
589                });
590                skip_to_byte = Some(*end);
591                word_start = *end;
592                char_idx += 1;
593                continue;
594            }
595            let next_pos = i + c.len_utf8();
596            match c {
597                ' ' | '\t' | '\n' | '\r' => {
598                    if !current_word.is_empty() {
599                        items.push(WordItem {
600                            word: std::mem::take(&mut current_word),
601                            trailing_punct: None,
602                            start: word_start,
603                            end: i,
604                            punct_pos: None,
605                        });
606                    }
607                    word_start = next_pos;
608                }
609                '.' => {
610                    // Check if this is a decimal point (digit before and after)
611                    let prev_is_digit = !current_word.is_empty()
612                        && current_word.chars().last().map_or(false, |ch| ch.is_ascii_digit());
613                    let next_is_digit = char_idx + 1 < chars.len()
614                        && chars[char_idx + 1].is_ascii_digit();
615
616                    if prev_is_digit && next_is_digit {
617                        // This is a decimal point, include it in the current word
618                        current_word.push(c);
619                    } else {
620                        // This is a sentence period
621                        if !current_word.is_empty() {
622                            items.push(WordItem {
623                                word: std::mem::take(&mut current_word),
624                                trailing_punct: Some(c),
625                                start: word_start,
626                                end: i,
627                                punct_pos: Some(i),
628                            });
629                        } else {
630                            items.push(WordItem {
631                                word: String::new(),
632                                trailing_punct: Some(c),
633                                start: i,
634                                end: next_pos,
635                                punct_pos: Some(i),
636                            });
637                        }
638                        word_start = next_pos;
639                    }
640                }
641                '#' => {
642                    // Check for ## block header (markdown-style)
643                    if char_idx + 1 < chars.len() && chars[char_idx + 1] == '#' {
644                        // This is a ## block header
645                        // Skip the second # and capture the next word as a block header
646                        if !current_word.is_empty() {
647                            items.push(WordItem {
648                                word: std::mem::take(&mut current_word),
649                                trailing_punct: None,
650                                start: word_start,
651                                end: i,
652                                punct_pos: None,
653                            });
654                        }
655                        // Skip whitespace after ##
656                        let header_start = i;
657                        let mut j = char_idx + 2;
658                        while j < chars.len() && (chars[j] == ' ' || chars[j] == '\t') {
659                            j += 1;
660                        }
661                        // Capture the block type word
662                        let mut block_word = String::from("##");
663                        while j < chars.len() && chars[j].is_alphabetic() {
664                            block_word.push(chars[j]);
665                            j += 1;
666                        }
667                        if block_word.len() > 2 {
668                            items.push(WordItem {
669                                word: block_word,
670                                trailing_punct: None,
671                                start: header_start,
672                                end: header_start + (j - char_idx),
673                                punct_pos: None,
674                            });
675                        }
676                        skip_count = j - char_idx - 1;
677                        word_start = header_start + (j - char_idx);
678                    } else {
679                        // Single # - treat as comment, skip to end of line
680                        // Count how many chars to skip (without modifying char_idx here -
681                        // the main loop's skip handler will increment it)
682                        let mut look_ahead = char_idx + 1;
683                        while look_ahead < chars.len() && chars[look_ahead] != '\n' {
684                            skip_count += 1;
685                            look_ahead += 1;
686                        }
687                        if !current_word.is_empty() {
688                            items.push(WordItem {
689                                word: std::mem::take(&mut current_word),
690                                trailing_punct: None,
691                                start: word_start,
692                                end: i,
693                                punct_pos: None,
694                            });
695                        }
696                        word_start = look_ahead + 1; // Start after the newline
697                    }
698                }
699                // String literals: "hello world" or """multi-line"""
700                '"' => {
701                    // Push any pending word
702                    if !current_word.is_empty() {
703                        items.push(WordItem {
704                            word: std::mem::take(&mut current_word),
705                            trailing_punct: None,
706                            start: word_start,
707                            end: i,
708                            punct_pos: None,
709                        });
710                    }
711
712                    // Check for triple-quote: """
713                    if char_idx + 2 < chars.len() && chars[char_idx + 1] == '"' && chars[char_idx + 2] == '"' {
714                        let string_start = i;
715                        let mut j = char_idx + 3; // skip opening """
716                        // Skip optional newline after opening """
717                        if j < chars.len() && chars[j] == '\n' {
718                            j += 1;
719                        }
720                        let mut raw_content = String::new();
721                        // Scan until closing """
722                        while j < chars.len() {
723                            if j + 2 < chars.len() && chars[j] == '"' && chars[j + 1] == '"' && chars[j + 2] == '"' {
724                                break;
725                            }
726                            raw_content.push(chars[j]);
727                            j += 1;
728                        }
729                        // Strip trailing newline before closing """
730                        if raw_content.ends_with('\n') {
731                            raw_content.pop();
732                        }
733                        // Dedent: find minimum common indentation and strip it
734                        let dedented = Self::dedent_triple_quote(&raw_content);
735                        let end_pos = if j + 2 < chars.len() { j + 3 } else { chars.len() };
736                        items.push(WordItem {
737                            word: format!("\x00STR:{}", dedented),
738                            trailing_punct: None,
739                            start: string_start,
740                            end: end_pos,
741                            punct_pos: None,
742                        });
743                        // Skip past the closing """
744                        if j + 2 < chars.len() {
745                            skip_count = (j + 2) - char_idx;
746                        } else {
747                            skip_count = chars.len() - 1 - char_idx;
748                        }
749                        word_start = end_pos;
750                    } else {
751                        // Single-quoted string: scan until closing quote
752                        let string_start = i;
753                        let mut j = char_idx + 1;
754                        let mut string_content = String::new();
755                        while j < chars.len() && chars[j] != '"' {
756                            if chars[j] == '\\' && j + 1 < chars.len() {
757                                // Escape sequence - skip backslash, include next char
758                                j += 1;
759                                if j < chars.len() {
760                                    string_content.push(chars[j]);
761                                }
762                            } else {
763                                string_content.push(chars[j]);
764                            }
765                            j += 1;
766                        }
767
768                        // Create a special marker for string literals
769                        // We prefix with a special character to identify in tokenize()
770                        items.push(WordItem {
771                            word: format!("\x00STR:{}", string_content),
772                            trailing_punct: None,
773                            start: string_start,
774                            end: if j < chars.len() { j + 1 } else { j },
775                            punct_pos: None,
776                        });
777
778                        // Skip past the closing quote
779                        if j < chars.len() {
780                            skip_count = j - char_idx;
781                        } else {
782                            skip_count = j - char_idx - 1;
783                        }
784                        word_start = if j < chars.len() { j + 1 } else { j };
785                    }
786                }
787                // Character literals with backticks: `x`
788                '`' => {
789                    // Push any pending word
790                    if !current_word.is_empty() {
791                        items.push(WordItem {
792                            word: std::mem::take(&mut current_word),
793                            trailing_punct: None,
794                            start: word_start,
795                            end: i,
796                            punct_pos: None,
797                        });
798                    }
799
800                    // Scan for character content and closing backtick
801                    let char_start = i;
802                    let mut j = char_idx + 1;
803                    let mut char_content = String::new();
804
805                    if j < chars.len() {
806                        if chars[j] == '\\' && j + 1 < chars.len() {
807                            // Escape sequence
808                            j += 1;
809                            let escaped_char = match chars[j] {
810                                'n' => '\n',
811                                't' => '\t',
812                                'r' => '\r',
813                                '\\' => '\\',
814                                '`' => '`',
815                                '0' => '\0',
816                                c => c,
817                            };
818                            char_content.push(escaped_char);
819                            j += 1;
820                        } else if chars[j] != '`' {
821                            // Regular character
822                            char_content.push(chars[j]);
823                            j += 1;
824                        }
825                    }
826
827                    // Expect closing backtick
828                    if j < chars.len() && chars[j] == '`' {
829                        j += 1; // skip closing backtick
830                    }
831
832                    // Create a special marker for char literals
833                    items.push(WordItem {
834                        word: format!("\x00CHAR:{}", char_content),
835                        trailing_punct: None,
836                        start: char_start,
837                        end: if j <= chars.len() { char_start + (j - char_idx) } else { char_start + 1 },
838                        punct_pos: None,
839                    });
840
841                    if j > char_idx + 1 {
842                        skip_count = j - char_idx - 1;
843                    }
844                    word_start = char_start + (j - char_idx);
845                }
846                // Handle -> as a single token for return type syntax
847                '-' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '>' => {
848                    // Push any pending word first
849                    if !current_word.is_empty() {
850                        items.push(WordItem {
851                            word: std::mem::take(&mut current_word),
852                            trailing_punct: None,
853                            start: word_start,
854                            end: i,
855                            punct_pos: None,
856                        });
857                    }
858                    // Push -> as its own word
859                    items.push(WordItem {
860                        word: "->".to_string(),
861                        trailing_punct: None,
862                        start: i,
863                        end: i + 2,
864                        punct_pos: None,
865                    });
866                    skip_count = 1; // Skip the '>' character
867                    word_start = i + 2;
868                }
869                // Grand Challenge: Handle <= as a single token
870                '<' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
871                    if !current_word.is_empty() {
872                        items.push(WordItem {
873                            word: std::mem::take(&mut current_word),
874                            trailing_punct: None,
875                            start: word_start,
876                            end: i,
877                            punct_pos: None,
878                        });
879                    }
880                    items.push(WordItem {
881                        word: "<=".to_string(),
882                        trailing_punct: None,
883                        start: i,
884                        end: i + 2,
885                        punct_pos: None,
886                    });
887                    skip_count = 1;
888                    word_start = i + 2;
889                }
890                // Grand Challenge: Handle >= as a single token
891                '>' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
892                    if !current_word.is_empty() {
893                        items.push(WordItem {
894                            word: std::mem::take(&mut current_word),
895                            trailing_punct: None,
896                            start: word_start,
897                            end: i,
898                            punct_pos: None,
899                        });
900                    }
901                    items.push(WordItem {
902                        word: ">=".to_string(),
903                        trailing_punct: None,
904                        start: i,
905                        end: i + 2,
906                        punct_pos: None,
907                    });
908                    skip_count = 1;
909                    word_start = i + 2;
910                }
911                // Handle == as a single token
912                '=' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
913                    if !current_word.is_empty() {
914                        items.push(WordItem {
915                            word: std::mem::take(&mut current_word),
916                            trailing_punct: None,
917                            start: word_start,
918                            end: i,
919                            punct_pos: None,
920                        });
921                    }
922                    items.push(WordItem {
923                        word: "==".to_string(),
924                        trailing_punct: None,
925                        start: i,
926                        end: i + 2,
927                        punct_pos: None,
928                    });
929                    skip_count = 1;
930                    word_start = i + 2;
931                }
932                // Handle != as a single token
933                '!' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
934                    if !current_word.is_empty() {
935                        items.push(WordItem {
936                            word: std::mem::take(&mut current_word),
937                            trailing_punct: None,
938                            start: word_start,
939                            end: i,
940                            punct_pos: None,
941                        });
942                    }
943                    items.push(WordItem {
944                        word: "!=".to_string(),
945                        trailing_punct: None,
946                        start: i,
947                        end: i + 2,
948                        punct_pos: None,
949                    });
950                    skip_count = 1;
951                    word_start = i + 2;
952                }
953                // Special handling for '-' in ISO-8601 dates (YYYY-MM-DD)
954                '-' if Self::is_date_hyphen(&current_word, &chars, char_idx) => {
955                    // This hyphen is part of a date, include it in the word
956                    current_word.push(c);
957                }
958                // Special handling for ':' in time literals (9:30am, 11:45pm)
959                ':' if Self::is_time_colon(&current_word, &chars, char_idx) => {
960                    // This colon is part of a time, include it in the word
961                    current_word.push(c);
962                }
963                // Scientific notation: 4.84e+00, 1.66E-03, 2.5e-2
964                '+' | '-' if Self::is_exponent_sign(&current_word, &chars, char_idx) => {
965                    current_word.push(c);
966                }
967                '(' | ')' | '[' | ']' | ',' | '?' | '!' | ':' | '+' | '-' | '*' | '/' | '%' | '<' | '>' | '=' => {
968                    if !current_word.is_empty() {
969                        items.push(WordItem {
970                            word: std::mem::take(&mut current_word),
971                            trailing_punct: Some(c),
972                            start: word_start,
973                            end: i,
974                            punct_pos: Some(i),
975                        });
976                    } else {
977                        items.push(WordItem {
978                            word: String::new(),
979                            trailing_punct: Some(c),
980                            start: i,
981                            end: next_pos,
982                            punct_pos: Some(i),
983                        });
984                    }
985                    word_start = next_pos;
986                }
987                '\'' => {
988                    // Handle contractions: expand "don't" → "do" + "not", etc.
989                    let remaining: String = chars[char_idx + 1..].iter().collect();
990                    let remaining_lower = remaining.to_lowercase();
991
992                    if remaining_lower.starts_with("t ") || remaining_lower.starts_with("t.") ||
993                       remaining_lower.starts_with("t,") || remaining_lower == "t" ||
994                       (char_idx + 1 < chars.len() && chars[char_idx + 1] == 't' &&
995                        (char_idx + 2 >= chars.len() || !chars[char_idx + 2].is_alphabetic())) {
996                        // This is a contraction ending in 't (don't, doesn't, won't, can't, etc.)
997                        let word_lower = current_word.to_lowercase();
998                        if word_lower == "don" || word_lower == "doesn" || word_lower == "didn" {
999                            // do/does/did + not
1000                            let base = if word_lower == "don" { "do" }
1001                                      else if word_lower == "doesn" { "does" }
1002                                      else { "did" };
1003                            items.push(WordItem {
1004                                word: base.to_string(),
1005                                trailing_punct: None,
1006                                start: word_start,
1007                                end: i,
1008                                punct_pos: None,
1009                            });
1010                            items.push(WordItem {
1011                                word: "not".to_string(),
1012                                trailing_punct: None,
1013                                start: i,
1014                                end: i + 2,
1015                                punct_pos: None,
1016                            });
1017                            current_word.clear();
1018                            word_start = next_pos + 1;
1019                            skip_count = 1;
1020                        } else if word_lower == "won" {
1021                            // will + not
1022                            items.push(WordItem {
1023                                word: "will".to_string(),
1024                                trailing_punct: None,
1025                                start: word_start,
1026                                end: i,
1027                                punct_pos: None,
1028                            });
1029                            items.push(WordItem {
1030                                word: "not".to_string(),
1031                                trailing_punct: None,
1032                                start: i,
1033                                end: i + 2,
1034                                punct_pos: None,
1035                            });
1036                            current_word.clear();
1037                            word_start = next_pos + 1;
1038                            skip_count = 1;
1039                        } else if word_lower == "can" {
1040                            // cannot
1041                            items.push(WordItem {
1042                                word: "cannot".to_string(),
1043                                trailing_punct: None,
1044                                start: word_start,
1045                                end: i + 2,
1046                                punct_pos: None,
1047                            });
1048                            current_word.clear();
1049                            word_start = next_pos + 1;
1050                            skip_count = 1;
1051                        } else {
1052                            // Unknown contraction, split normally
1053                            if !current_word.is_empty() {
1054                                items.push(WordItem {
1055                                    word: std::mem::take(&mut current_word),
1056                                    trailing_punct: Some('\''),
1057                                    start: word_start,
1058                                    end: i,
1059                                    punct_pos: Some(i),
1060                                });
1061                            }
1062                            word_start = next_pos;
1063                        }
1064                    } else {
1065                        // Not a 't contraction, handle normally
1066                        if !current_word.is_empty() {
1067                            items.push(WordItem {
1068                                word: std::mem::take(&mut current_word),
1069                                trailing_punct: Some('\''),
1070                                start: word_start,
1071                                end: i,
1072                                punct_pos: Some(i),
1073                            });
1074                        }
1075                        word_start = next_pos;
1076                    }
1077                }
1078                c if c.is_alphabetic() || c.is_ascii_digit() || (c == '.' && !current_word.is_empty() && current_word.chars().all(|ch| ch.is_ascii_digit())) || c == '_' => {
1079                    if current_word.is_empty() {
1080                        word_start = i;
1081                    }
1082                    current_word.push(c);
1083                }
1084                _ => {
1085                    word_start = next_pos;
1086                }
1087            }
1088            char_idx += 1;
1089        }
1090
1091        if !current_word.is_empty() {
1092            items.push(WordItem {
1093                word: current_word,
1094                trailing_punct: None,
1095                start: word_start,
1096                end: input.len(),
1097                punct_pos: None,
1098            });
1099        }
1100
1101        items
1102    }
1103
1104    fn peek_word(&self, offset: usize) -> Option<&str> {
1105        self.words.get(self.pos + offset).map(|w| w.word.as_str())
1106    }
1107
1108    /// Check if the previous word is a determiner (every, each, some, all, any, no, the, a, an).
1109    fn prev_token_is_determiner(&self) -> bool {
1110        if self.pos == 0 { return false; }
1111        if let Some(prev) = self.words.get(self.pos - 1) {
1112            matches!(prev.word.to_lowercase().as_str(),
1113                "every" | "each" | "some" | "all" | "any" | "no" | "the" | "a" | "an")
1114        } else {
1115            false
1116        }
1117    }
1118
1119    fn next_token_is_copula(&self) -> bool {
1120        if let Some(next) = self.peek_word(1) {
1121            matches!(next.to_lowercase().as_str(), "is" | "are" | "was" | "were")
1122        } else {
1123            false
1124        }
1125    }
1126
1127    fn peek_sequence(&self, expected: &[&str]) -> bool {
1128        for (i, &exp) in expected.iter().enumerate() {
1129            match self.peek_word(i + 1) {
1130                Some(w) if w.to_lowercase() == exp => continue,
1131                _ => return false,
1132            }
1133        }
1134        true
1135    }
1136
    /// Advances the word cursor past `count` items without emitting tokens.
    fn consume_words(&mut self, count: usize) {
        self.pos += count;
    }
1140
1141    /// Tokenizes the input text and returns a vector of [`Token`]s.
1142    ///
1143    /// Each token includes its type, the interned lexeme, and the source
1144    /// span for error reporting. Words are classified according to the
1145    /// lexicon database with priority-based ambiguity resolution.
1146    ///
1147    /// # Returns
1148    ///
1149    /// A vector of tokens representing the input. The final token is
1150    /// typically `TokenType::Eof`.
1151    pub fn tokenize(&mut self) -> Vec<Token> {
1152        let mut tokens = Vec::new();
1153
1154        while self.pos < self.words.len() {
1155            let item = &self.words[self.pos];
1156            let word = item.word.clone();
1157            let trailing_punct = item.trailing_punct;
1158            let word_start = item.start;
1159            let word_end = item.end;
1160            let punct_pos = item.punct_pos;
1161
1162            if word.is_empty() {
1163                if let Some(punct) = trailing_punct {
1164                    let kind = match punct {
1165                        '(' => TokenType::LParen,
1166                        ')' => TokenType::RParen,
1167                        '[' => TokenType::LBracket,
1168                        ']' => TokenType::RBracket,
1169                        ',' => TokenType::Comma,
1170                        ':' => TokenType::Colon,
1171                        '.' | '?' => {
1172                            self.in_let_context = false;
1173                            TokenType::Period
1174                        }
1175                        '!' => TokenType::Exclamation,
1176                        '+' => TokenType::Plus,
1177                        '-' => TokenType::Minus,
1178                        '*' => TokenType::Star,
1179                        '/' => TokenType::Slash,
1180                        '%' => TokenType::Percent,
1181                        '<' => TokenType::Lt,
1182                        '>' => TokenType::Gt,
1183                        '=' => TokenType::Assign,
1184                        _ => {
1185                            self.pos += 1;
1186                            continue;
1187                        }
1188                    };
1189                    let lexeme = self.interner.intern(&punct.to_string());
1190                    let span = Span::new(word_start, word_end);
1191                    tokens.push(Token::new(kind, lexeme, span));
1192                }
1193                self.pos += 1;
1194                continue;
1195            }
1196
1197            // Check for string literal marker (pre-tokenized in Stage 1)
1198            if word.starts_with("\x00STR:") {
1199                let content = &word[5..]; // Skip the marker prefix
1200                let span = Span::new(word_start, word_end);
1201                if Self::has_unescaped_brace(content) {
1202                    let sym = self.interner.intern(content);
1203                    tokens.push(Token::new(TokenType::InterpolatedString(sym), sym, span));
1204                } else {
1205                    // Collapse {{ → { and }} → } for plain strings
1206                    let normalized = content.replace("{{", "{").replace("}}", "}");
1207                    let sym = self.interner.intern(&normalized);
1208                    tokens.push(Token::new(TokenType::StringLiteral(sym), sym, span));
1209                }
1210                self.pos += 1;
1211                continue;
1212            }
1213
1214            // Check for character literal marker
1215            if word.starts_with("\x00CHAR:") {
1216                let content = &word[6..]; // Skip the marker prefix
1217                let sym = self.interner.intern(content);
1218                let span = Span::new(word_start, word_end);
1219                tokens.push(Token::new(TokenType::CharLiteral(sym), sym, span));
1220                self.pos += 1;
1221                continue;
1222            }
1223
1224            // Check for escape block marker (pre-captured raw foreign code)
1225            if word.starts_with("\x00ESC:") {
1226                let content = &word[5..]; // Skip the "\x00ESC:" prefix
1227                let sym = self.interner.intern(content);
1228                let span = Span::new(word_start, word_end);
1229                tokens.push(Token::new(TokenType::EscapeBlock(sym), sym, span));
1230                self.pos += 1;
1231                continue;
1232            }
1233
1234            let kind = self.classify_with_lookahead(&word);
1235            let lexeme = self.interner.intern(&word);
1236            let span = Span::new(word_start, word_end);
1237            tokens.push(Token::new(kind, lexeme, span));
1238
1239            if let Some(punct) = trailing_punct {
1240                if punct == '\'' {
1241                    if let Some(next_item) = self.words.get(self.pos + 1) {
1242                        if next_item.word.to_lowercase() == "s" {
1243                            let poss_lexeme = self.interner.intern("'s");
1244                            let poss_start = punct_pos.unwrap_or(word_end);
1245                            let poss_end = next_item.end;
1246                            tokens.push(Token::new(TokenType::Possessive, poss_lexeme, Span::new(poss_start, poss_end)));
1247                            self.pos += 1;
1248                            if let Some(s_punct) = next_item.trailing_punct {
1249                                let kind = match s_punct {
1250                                    '(' => TokenType::LParen,
1251                                    ')' => TokenType::RParen,
1252                                    '[' => TokenType::LBracket,
1253                                    ']' => TokenType::RBracket,
1254                                    ',' => TokenType::Comma,
1255                                    ':' => TokenType::Colon,
1256                                    '.' | '?' => TokenType::Period,
1257                                    '!' => TokenType::Exclamation,
1258                                    '+' => TokenType::Plus,
1259                                    '-' => TokenType::Minus,
1260                                    '*' => TokenType::Star,
1261                                    '/' => TokenType::Slash,
1262                                    '%' => TokenType::Percent,
1263                                    '<' => TokenType::Lt,
1264                                    '>' => TokenType::Gt,
1265                                    '=' => TokenType::Assign,
1266                                    _ => {
1267                                        self.pos += 1;
1268                                        continue;
1269                                    }
1270                                };
1271                                let s_punct_pos = next_item.punct_pos.unwrap_or(next_item.end);
1272                                let lexeme = self.interner.intern(&s_punct.to_string());
1273                                tokens.push(Token::new(kind, lexeme, Span::new(s_punct_pos, s_punct_pos + 1)));
1274                            }
1275                            self.pos += 1;
1276                            continue;
1277                        }
1278                    }
1279                    self.pos += 1;
1280                    continue;
1281                }
1282
1283                let kind = match punct {
1284                    '(' => TokenType::LParen,
1285                    ')' => TokenType::RParen,
1286                    '[' => TokenType::LBracket,
1287                    ']' => TokenType::RBracket,
1288                    ',' => TokenType::Comma,
1289                    ':' => TokenType::Colon,
1290                    '.' | '?' => {
1291                        self.in_let_context = false;
1292                        TokenType::Period
1293                    }
1294                    '!' => TokenType::Exclamation,
1295                    '+' => TokenType::Plus,
1296                    '-' => TokenType::Minus,
1297                    '*' => TokenType::Star,
1298                    '/' => TokenType::Slash,
1299                    '%' => TokenType::Percent,
1300                    '<' => TokenType::Lt,
1301                    '>' => TokenType::Gt,
1302                    '=' => TokenType::Assign,
1303                    _ => {
1304                        self.pos += 1;
1305                        continue;
1306                    }
1307                };
1308                let p_start = punct_pos.unwrap_or(word_end);
1309                let lexeme = self.interner.intern(&punct.to_string());
1310                tokens.push(Token::new(kind, lexeme, Span::new(p_start, p_start + 1)));
1311            }
1312
1313            self.pos += 1;
1314        }
1315
1316        let eof_lexeme = self.interner.intern("");
1317        let eof_span = Span::new(self.input_len, self.input_len);
1318        tokens.push(Token::new(TokenType::EOF, eof_lexeme, eof_span));
1319
1320        self.insert_indentation_tokens(tokens)
1321    }
1322
1323    /// Insert Indent/Dedent tokens using LineLexer's two-pass architecture (Spec §2.5.2).
1324    ///
1325    /// Phase 1: LineLexer determines the structural layout (where indents/dedents occur)
1326    /// Phase 2: We correlate these with word token positions
1327    fn insert_indentation_tokens(&mut self, tokens: Vec<Token>) -> Vec<Token> {
1328        let mut result = Vec::new();
1329        let empty_sym = self.interner.intern("");
1330
1331        // Phase 1: Run LineLexer to determine structural positions
1332        let line_lexer = LineLexer::new(&self.source);
1333        let line_tokens: Vec<LineToken> = line_lexer.collect();
1334
1335        // Build a list of (byte_position, is_indent) for structural tokens
1336        // Position is where the NEXT Content starts after the Indent/Dedent
1337        let mut structural_events: Vec<(usize, bool)> = Vec::new(); // (byte_pos, true=Indent, false=Dedent)
1338        let mut pending_indents = 0usize;
1339        let mut pending_dedents = 0usize;
1340
1341        for line_token in &line_tokens {
1342            match line_token {
1343                LineToken::Indent => {
1344                    pending_indents += 1;
1345                }
1346                LineToken::Dedent => {
1347                    pending_dedents += 1;
1348                }
1349                LineToken::Content { start, .. } => {
1350                    // Emit pending dedents first (they come BEFORE the content)
1351                    for _ in 0..pending_dedents {
1352                        structural_events.push((*start, false)); // false = Dedent
1353                    }
1354                    pending_dedents = 0;
1355
1356                    // Emit pending indents (they also come BEFORE the content)
1357                    for _ in 0..pending_indents {
1358                        structural_events.push((*start, true)); // true = Indent
1359                    }
1360                    pending_indents = 0;
1361                }
1362                LineToken::Newline => {}
1363            }
1364        }
1365
1366        // Handle any remaining dedents at EOF
1367        for _ in 0..pending_dedents {
1368            structural_events.push((self.input_len, false));
1369        }
1370
1371        // Filter out structural events from within escape block bodies.
1372        // The LineLexer sees raw Rust code lines and generates spurious Indent/Dedent
1373        // events for their indentation changes. We keep exactly the boundary events
1374        // (Indent at body start, Dedent at body end) but remove internal ones.
1375        if !self.escape_body_ranges.is_empty() {
1376            // For each escape body range, find the first Indent at the body start and
1377            // track that we're inside the range. Filter out all events strictly inside
1378            // the range except for the first Indent and events at/after the end.
1379            let mut filtered = Vec::new();
1380            for &(pos, is_indent) in &structural_events {
1381                let is_inside_escape_body = self.escape_body_ranges.iter().any(|(start, end)| {
1382                    // Strictly inside the body (not at start boundary and not at/after end)
1383                    pos > *start && pos < *end
1384                });
1385                if !is_inside_escape_body {
1386                    filtered.push((pos, is_indent));
1387                }
1388            }
1389            structural_events = filtered;
1390        }
1391
1392        // Filter out structural events from within multi-line string literals.
1393        // Triple-quote strings span multiple lines; their internal indentation
1394        // must not generate Indent/Dedent tokens.
1395        {
1396            let string_spans: Vec<(usize, usize)> = tokens.iter()
1397                .filter(|t| matches!(t.kind, TokenType::StringLiteral(_) | TokenType::InterpolatedString(_)))
1398                .filter(|t| t.span.end - t.span.start > 6) // only multi-line strings (""" adds >=6 chars)
1399                .map(|t| (t.span.start, t.span.end))
1400                .collect();
1401            if !string_spans.is_empty() {
1402                structural_events.retain(|&(pos, _)| {
1403                    !string_spans.iter().any(|(start, end)| pos > *start && pos < *end)
1404                });
1405            }
1406        }
1407
1408        // Sort events by position, with dedents before indents at same position
1409        structural_events.sort_by(|a, b| {
1410            if a.0 != b.0 {
1411                a.0.cmp(&b.0)
1412            } else {
1413                // Dedents (false) before Indents (true) at same position
1414                a.1.cmp(&b.1)
1415            }
1416        });
1417
1418        // Phase 2: Insert structural tokens at the right positions
1419        // Strategy: For each word token, check if any structural events should be inserted
1420        // before it (based on byte position)
1421
1422        let mut event_idx = 0;
1423        let mut last_colon_pos: Option<usize> = None;
1424
1425        for token in tokens.iter() {
1426            let token_start = token.span.start;
1427
1428            // Insert any structural tokens that should come BEFORE this token
1429            while event_idx < structural_events.len() {
1430                let (event_pos, is_indent) = structural_events[event_idx];
1431
1432                // Insert structural tokens before this token if the event position <= token start
1433                if event_pos <= token_start {
1434                    let span = if is_indent {
1435                        // Indent is inserted after the preceding Colon
1436                        Span::new(last_colon_pos.unwrap_or(event_pos), last_colon_pos.unwrap_or(event_pos))
1437                    } else {
1438                        Span::new(event_pos, event_pos)
1439                    };
1440                    let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
1441                    result.push(Token::new(kind, empty_sym, span));
1442                    event_idx += 1;
1443                } else {
1444                    break;
1445                }
1446            }
1447
1448            result.push(token.clone());
1449
1450            // Track colon positions for Indent span calculation
1451            if token.kind == TokenType::Colon && self.is_end_of_line(token.span.end) {
1452                last_colon_pos = Some(token.span.end);
1453            }
1454        }
1455
1456        // Insert any remaining structural tokens (typically Dedents at EOF)
1457        while event_idx < structural_events.len() {
1458            let (event_pos, is_indent) = structural_events[event_idx];
1459            let span = Span::new(event_pos, event_pos);
1460            let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
1461            result.push(Token::new(kind, empty_sym, span));
1462            event_idx += 1;
1463        }
1464
1465        // Ensure EOF is at the end
1466        let eof_pos = result.iter().position(|t| t.kind == TokenType::EOF);
1467        if let Some(pos) = eof_pos {
1468            let eof = result.remove(pos);
1469            result.push(eof);
1470        }
1471
1472        result
1473    }
1474
1475    /// Check if position is at end of line (only whitespace until newline)
1476    fn is_end_of_line(&self, from_pos: usize) -> bool {
1477        let bytes = self.source.as_bytes();
1478        let mut pos = from_pos;
1479        while pos < bytes.len() {
1480            match bytes[pos] {
1481                b' ' | b'\t' => pos += 1,
1482                b'\n' => return true,
1483                _ => return false,
1484            }
1485        }
1486        true // End of input is also end of line
1487    }
1488
1489    fn measure_next_line_indent(&self, from_pos: usize) -> Option<usize> {
1490        let bytes = self.source.as_bytes();
1491        let mut pos = from_pos;
1492
1493        while pos < bytes.len() && bytes[pos] != b'\n' {
1494            pos += 1;
1495        }
1496
1497        if pos >= bytes.len() {
1498            return None;
1499        }
1500
1501        pos += 1;
1502
1503        let mut indent = 0;
1504        while pos < bytes.len() {
1505            match bytes[pos] {
1506                b' ' => indent += 1,
1507                b'\t' => indent += 4,
1508                b'\n' => {
1509                    indent = 0;
1510                }
1511                _ => break,
1512            }
1513            pos += 1;
1514        }
1515
1516        if pos >= bytes.len() {
1517            return None;
1518        }
1519
1520        Some(indent)
1521    }
1522
    /// Look up the numeric value of a spelled-out number word via the
    /// lexicon. Matching is case-insensitive (the word is lowercased first);
    /// returns `None` when the lexicon has no entry for it.
    fn word_to_number(word: &str) -> Option<u32> {
        lexicon::word_to_number(&word.to_lowercase())
    }
1526
1527    /// Check if a hyphen at the current position is part of an ISO-8601 date.
1528    ///
1529    /// Detects patterns like:
1530    /// - "2026-" followed by "05-20" → first hyphen of date
1531    /// - "2026-05-" followed by "20" → second hyphen of date
1532    fn is_date_hyphen(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1533        // Current word must be all digits (year or year-month)
1534        let word_chars: Vec<char> = current_word.chars().collect();
1535
1536        // Check for first hyphen pattern: YYYY- followed by MM-DD
1537        if word_chars.len() == 4 && word_chars.iter().all(|c| c.is_ascii_digit()) {
1538            // Check if followed by exactly 2 digits, hyphen, 2 digits
1539            if char_idx + 5 < chars.len()
1540                && chars[char_idx + 1].is_ascii_digit()
1541                && chars[char_idx + 2].is_ascii_digit()
1542                && chars[char_idx + 3] == '-'
1543                && chars[char_idx + 4].is_ascii_digit()
1544                && chars[char_idx + 5].is_ascii_digit()
1545            {
1546                return true;
1547            }
1548        }
1549
1550        // Check for second hyphen pattern: YYYY-MM- followed by DD
1551        if word_chars.len() == 7
1552            && word_chars[0..4].iter().all(|c| c.is_ascii_digit())
1553            && word_chars[4] == '-'
1554            && word_chars[5..7].iter().all(|c| c.is_ascii_digit())
1555        {
1556            // Check if followed by exactly 2 digits
1557            if char_idx + 2 < chars.len()
1558                && chars[char_idx + 1].is_ascii_digit()
1559                && chars[char_idx + 2].is_ascii_digit()
1560            {
1561                // Make sure we're not followed by more digits (would be a longer number)
1562                let next_not_digit = char_idx + 3 >= chars.len()
1563                    || !chars[char_idx + 3].is_ascii_digit();
1564                if next_not_digit {
1565                    return true;
1566                }
1567            }
1568        }
1569
1570        false
1571    }
1572
1573    /// Check if a colon is part of a time literal (e.g., 9:30am, 11:45pm).
1574    ///
1575    /// Detects patterns like:
1576    /// - "9:" followed by "30am" or "30pm"
1577    /// - "11:" followed by "45pm"
1578    fn is_time_colon(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1579        // Current word must be 1-2 digits (hour)
1580        let word_chars: Vec<char> = current_word.chars().collect();
1581        if word_chars.is_empty() || word_chars.len() > 2 {
1582            return false;
1583        }
1584        if !word_chars.iter().all(|c| c.is_ascii_digit()) {
1585            return false;
1586        }
1587
1588        // Check if followed by exactly 2 digits and then "am" or "pm"
1589        if char_idx + 4 < chars.len()
1590            && chars[char_idx + 1].is_ascii_digit()
1591            && chars[char_idx + 2].is_ascii_digit()
1592        {
1593            // Check for "am" or "pm" suffix
1594            let next_two: String = chars[char_idx + 3..char_idx + 5].iter().collect();
1595            let lower = next_two.to_lowercase();
1596            if lower == "am" || lower == "pm" {
1597                // Make sure we're not followed by more alphabetic chars
1598                let after_suffix = char_idx + 5 >= chars.len()
1599                    || !chars[char_idx + 5].is_alphabetic();
1600                if after_suffix {
1601                    return true;
1602                }
1603            }
1604        }
1605
1606        false
1607    }
1608
1609    /// Check if a string contains an unescaped `{` (i.e., not part of `{{`).
1610    /// Used to distinguish `InterpolatedString` from `StringLiteral`.
1611    fn has_unescaped_brace(content: &str) -> bool {
1612        let bytes = content.as_bytes();
1613        let mut i = 0;
1614        while i < bytes.len() {
1615            if bytes[i] == b'{' {
1616                if i + 1 < bytes.len() && bytes[i + 1] == b'{' {
1617                    i += 2;
1618                } else {
1619                    return true;
1620                }
1621            } else {
1622                i += 1;
1623            }
1624        }
1625        false
1626    }
1627
1628    /// Check if a `+` or `-` at the current position is the sign of a scientific notation exponent.
1629    ///
1630    /// Detects patterns like:
1631    /// - "4.84e+" followed by "00" → exponent sign in `4.84e+00`
1632    /// - "2.5e-" followed by "2"  → exponent sign in `2.5e-2`
1633    fn is_exponent_sign(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1634        // Word must end with e/E
1635        if !current_word.ends_with('e') && !current_word.ends_with('E') {
1636            return false;
1637        }
1638        // Before e/E must contain a digit (ensures it's a number, not a bare "e")
1639        let before_e = &current_word[..current_word.len() - 1];
1640        if before_e.is_empty() || !before_e.chars().next().unwrap().is_ascii_digit() {
1641            return false;
1642        }
1643        // Next char must be a digit (the exponent value)
1644        char_idx + 1 < chars.len() && chars[char_idx + 1].is_ascii_digit()
1645    }
1646
1647    /// Dedent a triple-quoted string: strip the common leading whitespace from each line.
1648    /// Joins lines with literal newline characters (not escape sequences).
1649    fn dedent_triple_quote(raw: &str) -> String {
1650        let lines: Vec<&str> = raw.lines().collect();
1651        if lines.is_empty() {
1652            return String::new();
1653        }
1654        // Find minimum indentation of non-empty lines
1655        let min_indent = lines.iter()
1656            .filter(|l| !l.trim().is_empty())
1657            .map(|l| l.len() - l.trim_start().len())
1658            .min()
1659            .unwrap_or(0);
1660        // Strip that indentation and join with actual newlines
1661        lines.iter()
1662            .map(|l| {
1663                if l.len() >= min_indent {
1664                    &l[min_indent..]
1665                } else {
1666                    l.trim()
1667                }
1668            })
1669            .collect::<Vec<_>>()
1670            .join("\n")
1671    }
1672
1673    fn is_numeric_literal(word: &str) -> bool {
1674        if word.is_empty() {
1675            return false;
1676        }
1677        let chars: Vec<char> = word.chars().collect();
1678        let first = chars[0];
1679        if first.is_ascii_digit() {
1680            // Numeric literal: starts with digit (may have underscore separators like 1_000)
1681            return true;
1682        }
1683        // Symbolic numbers: only recognize known mathematical symbols
1684        // (aleph, omega, beth) followed by underscore and digits
1685        if let Some(underscore_pos) = word.rfind('_') {
1686            let before_underscore = &word[..underscore_pos];
1687            let after_underscore = &word[underscore_pos + 1..];
1688            // Must be a known mathematical symbol prefix AND digits after underscore
1689            let is_math_symbol = matches!(
1690                before_underscore.to_lowercase().as_str(),
1691                "aleph" | "omega" | "beth"
1692            );
1693            if is_math_symbol
1694                && !after_underscore.is_empty()
1695                && after_underscore.chars().all(|c| c.is_ascii_digit())
1696            {
1697                return true;
1698            }
1699        }
1700        false
1701    }
1702
1703    /// Parse a duration literal with SI suffix.
1704    ///
1705    /// Returns Some((nanoseconds, unit_str)) if the word is a valid duration literal,
1706    /// None otherwise.
1707    ///
1708    /// Supported suffixes:
1709    /// - ns: nanoseconds
1710    /// - us, μs: microseconds
1711    /// - ms: milliseconds
1712    /// - s, sec: seconds
1713    /// - min: minutes
1714    /// - h, hr: hours
1715    fn parse_duration_literal(word: &str) -> Option<(i64, &str)> {
1716        if word.is_empty() || !word.chars().next()?.is_ascii_digit() {
1717            return None;
1718        }
1719
1720        // SI suffix table with multipliers to nanoseconds
1721        const SUFFIXES: &[(&str, i64)] = &[
1722            ("ns", 1),
1723            ("μs", 1_000),
1724            ("us", 1_000),
1725            ("ms", 1_000_000),
1726            ("sec", 1_000_000_000),
1727            ("s", 1_000_000_000),
1728            ("min", 60_000_000_000),
1729            ("hr", 3_600_000_000_000),
1730            ("h", 3_600_000_000_000),
1731        ];
1732
1733        // Try each suffix (longer suffixes first to avoid partial matches)
1734        for (suffix, multiplier) in SUFFIXES {
1735            if word.ends_with(suffix) {
1736                let num_part = &word[..word.len() - suffix.len()];
1737                // Parse the numeric part (may have underscore separators)
1738                let cleaned: String = num_part.chars().filter(|c| *c != '_').collect();
1739                if let Ok(n) = cleaned.parse::<i64>() {
1740                    return Some((n.saturating_mul(*multiplier), *suffix));
1741                }
1742            }
1743        }
1744
1745        None
1746    }
1747
1748    /// Parse an ISO-8601 date literal (YYYY-MM-DD).
1749    ///
1750    /// Returns Some(days_since_epoch) if the word is a valid date literal,
1751    /// None otherwise.
1752    fn parse_date_literal(word: &str) -> Option<i32> {
1753        // Must match pattern: YYYY-MM-DD
1754        if word.len() != 10 {
1755            return None;
1756        }
1757
1758        let bytes = word.as_bytes();
1759
1760        // Check format: 4 digits, hyphen, 2 digits, hyphen, 2 digits
1761        if bytes[4] != b'-' || bytes[7] != b'-' {
1762            return None;
1763        }
1764
1765        // Parse year, month, day
1766        let year: i32 = word[0..4].parse().ok()?;
1767        let month: u32 = word[5..7].parse().ok()?;
1768        let day: u32 = word[8..10].parse().ok()?;
1769
1770        // Basic validation
1771        if month < 1 || month > 12 || day < 1 || day > 31 {
1772            return None;
1773        }
1774
1775        // Convert to days since Unix epoch using Howard Hinnant's algorithm
1776        // https://howardhinnant.github.io/date_algorithms.html
1777        let y = if month <= 2 { year - 1 } else { year };
1778        let era = if y >= 0 { y / 400 } else { (y - 399) / 400 };
1779        let yoe = (y - era * 400) as u32;
1780        let m = month;
1781        let doy = (153 * (if m > 2 { m - 3 } else { m + 9 }) + 2) / 5 + day - 1;
1782        let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
1783        let days = era * 146097 + doe as i32 - 719468;
1784
1785        Some(days)
1786    }
1787
1788    /// Parse a time-of-day literal.
1789    ///
1790    /// Supported formats:
1791    /// - 12-hour with am/pm: "4pm", "9am", "12pm"
1792    /// - 12-hour with minutes: "9:30am", "11:45pm"
1793    /// - Special words: "noon" (12:00), "midnight" (00:00)
1794    ///
1795    /// Returns Some(nanos_from_midnight) if valid, None otherwise.
1796    fn parse_time_literal(word: &str) -> Option<i64> {
1797        let lower = word.to_lowercase();
1798
1799        // Handle special time words
1800        if lower == "noon" {
1801            return Some(12i64 * 3600 * 1_000_000_000);
1802        }
1803        if lower == "midnight" {
1804            return Some(0);
1805        }
1806
1807        // Handle 12-hour formats: "4pm", "9am", "9:30am", "11:45pm"
1808        let is_pm = lower.ends_with("pm");
1809        let is_am = lower.ends_with("am");
1810
1811        if !is_pm && !is_am {
1812            return None;
1813        }
1814
1815        // Strip the am/pm suffix
1816        let time_part = &lower[..lower.len() - 2];
1817
1818        // Check for hour:minute format
1819        let (hour, minute): (i64, i64) = if let Some(colon_idx) = time_part.find(':') {
1820            let hour_str = &time_part[..colon_idx];
1821            let min_str = &time_part[colon_idx + 1..];
1822            let h: i64 = hour_str.parse().ok()?;
1823            let m: i64 = min_str.parse().ok()?;
1824            (h, m)
1825        } else {
1826            // Just hour: "4pm", "9am"
1827            let h: i64 = time_part.parse().ok()?;
1828            (h, 0)
1829        };
1830
1831        // Validate ranges
1832        if hour < 1 || hour > 12 || minute < 0 || minute > 59 {
1833            return None;
1834        }
1835
1836        // Convert to 24-hour format
1837        let hour_24 = if is_am {
1838            if hour == 12 { 0 } else { hour }  // 12am = midnight = 0
1839        } else {
1840            if hour == 12 { 12 } else { hour + 12 }  // 12pm = noon = 12, 4pm = 16
1841        };
1842
1843        // Convert to nanoseconds from midnight
1844        let nanos = (hour_24 * 3600 + minute * 60) * 1_000_000_000;
1845        Some(nanos)
1846    }
1847
    /// Classify `word` into a token type, using multi-word lookahead.
    ///
    /// Unlike `classify_word`, this may consume following words from the
    /// stream (via `consume_words`) when a multi-word token is recognized
    /// ("each other", "at least N", "if and only if", "is equal to", ...).
    /// It also switches the lexer mode (Imperative/Declarative) when a
    /// `##` block header is seen. Falls through to `classify_word` when no
    /// lookahead rule applies.
    fn classify_with_lookahead(&mut self, word: &str) -> TokenType {
        // Handle block headers (##Theorem, ##Main, etc.)
        if word.starts_with("##") {
            let block_name = &word[2..];
            let block_type = match block_name.to_lowercase().as_str() {
                "theorem" => BlockType::Theorem,
                "main" => BlockType::Main,
                "definition" => BlockType::Definition,
                "proof" => BlockType::Proof,
                "example" => BlockType::Example,
                "logic" => BlockType::Logic,
                "note" => BlockType::Note,
                "to" => BlockType::Function,  // Function definition block
                "a" | "an" => BlockType::TypeDef,  // Inline type definitions: ## A Point has:
                "policy" => BlockType::Policy,  // Security policy definitions
                "requires" => BlockType::Requires,  // External crate dependencies
                "hardware" => BlockType::Hardware,  // Signal declarations
                "property" => BlockType::Property,  // Temporal assertions
                "no" => BlockType::No,  // Optimization annotation: ## No Memo, ## No TCO, etc.
                _ => BlockType::Note, // Default unknown block types to Note
            };

            // Update lexer mode based on block type
            // (Main/Function bodies are imperative; everything else is
            // declarative — this affects keyword recognition downstream).
            self.mode = match block_type {
                BlockType::Main | BlockType::Function => LexerMode::Imperative,
                _ => LexerMode::Declarative,
            };

            return TokenType::BlockHeader { block_type };
        }

        let lower = word.to_lowercase();

        // "each other" → single reciprocal token
        if lower == "each" && self.peek_sequence(&["other"]) {
            self.consume_words(1);
            return TokenType::Reciprocal;
        }

        // "to" is an infinitive marker before a verb, else a preposition.
        if lower == "to" {
            if let Some(next) = self.peek_word(1) {
                if self.is_verb_like(next) {
                    return TokenType::To;
                }
            }
            let sym = self.interner.intern("to");
            return TokenType::Preposition(sym);
        }

        // "at least N" / "at most N" → counting quantifier tokens
        if lower == "at" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if next_lower == "least" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtLeast(n);
                        }
                    }
                }
                if next_lower == "most" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtMost(n);
                        }
                    }
                }
            }
        }

        // "Exactly N" → Cardinal(N) — same as bare number but explicit
        if lower == "exactly" {
            if let Some(num_word) = self.peek_word(1) {
                if let Some(n) = Self::word_to_number(num_word) {
                    self.consume_words(1);
                    return TokenType::Cardinal(n);
                }
            }
        }

        // Spelled-out numbers ("three") become cardinals.
        if let Some(n) = Self::word_to_number(&lower) {
            return TokenType::Cardinal(n);
        }

        // Check for duration literal first (e.g., "500ms", "2s", "50ns")
        if let Some((nanos, unit)) = Self::parse_duration_literal(word) {
            let unit_sym = self.interner.intern(unit);
            return TokenType::DurationLiteral {
                nanos,
                original_unit: unit_sym,
            };
        }

        // Check for ISO-8601 date literal (e.g., "2026-05-20")
        if let Some(days) = Self::parse_date_literal(word) {
            return TokenType::DateLiteral { days };
        }

        // Check for time-of-day literal (e.g., "4pm", "9:30am", "noon", "midnight")
        if let Some(nanos_from_midnight) = Self::parse_time_literal(word) {
            return TokenType::TimeLiteral { nanos_from_midnight };
        }

        if Self::is_numeric_literal(word) {
            let sym = self.interner.intern(word);
            return TokenType::Number(sym);
        }

        // "if and only if" → biconditional
        if lower == "if" && self.peek_sequence(&["and", "only", "if"]) {
            self.consume_words(3);
            return TokenType::Iff;
        }

        // "is equal to" / "is identical to" → identity
        if lower == "is" {
            if self.peek_sequence(&["equal", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
            if self.peek_sequence(&["identical", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
        }

        if (lower == "a" || lower == "an") && word.chars().next().unwrap().is_uppercase() {
            // Capitalized "A" or "An" - disambiguate article vs proper name
            // Heuristic: articles are followed by nouns/adjectives, not verbs or keywords
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let next_starts_lowercase = next.chars().next().map(|c| c.is_lowercase()).unwrap_or(false);

                // If followed by logical keyword, treat as proper name (propositional variable)
                if matches!(next_lower.as_str(), "if" | "and" | "or" | "implies" | "iff") {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // If next word is ONLY a verb (like "has", "is", "ran"), A is likely a name
                // Exception: gerunds (like "running") can follow articles
                // Exception: words in disambiguation_not_verbs (like "red") are not verbs
                // Exception: words that are also nouns/adjectives (like "fire") can follow articles
                let is_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    && !lexicon::is_disambiguation_not_verb(&next_lower);
                let is_gerund = next_lower.ends_with("ing");
                let is_also_noun_or_adj = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_verb && !is_gerund && !is_also_noun_or_adj {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // Definition pattern: "A [TypeName] is a..." or "A [TypeName] has:" - treat A as article
                // even when TypeName is capitalized and unknown
                if let Some(third) = self.peek_word(2) {
                    let third_lower = third.to_lowercase();
                    // "has" for struct definitions: "A Point has:"
                    if third_lower == "is" || third_lower == "are" || third_lower == "has" {
                        return TokenType::Article(Definiteness::Indefinite);
                    }
                }

                // It's an article if next word is:
                // - A known noun or adjective, or
                // - Lowercase (likely a common word we don't recognize)
                let is_content_word = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_content_word || next_starts_lowercase {
                    return TokenType::Article(Definiteness::Indefinite);
                }
            }
            // No usable lookahead: default to proper name.
            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        // No lookahead rule matched: fall back to single-word classification.
        self.classify_word(word)
    }
2022
2023    fn is_noun_like(&self, word: &str) -> bool {
2024        if lexicon::is_noun_pattern(word) || lexicon::is_common_noun(word) {
2025            return true;
2026        }
2027        if word.ends_with("er") || word.ends_with("ian") || word.ends_with("ist") {
2028            return true;
2029        }
2030        false
2031    }
2032
    /// Heuristic: does `word` look like an adjective?
    ///
    /// True when the lexicon lists it either as a plain adjective or as a
    /// non-intersective modifier.
    fn is_adjective_like(&self, word: &str) -> bool {
        lexicon::is_adjective(word) || lexicon::is_non_intersective(word)
    }
2036
2037    fn classify_word(&mut self, word: &str) -> TokenType {
2038        let lower = word.to_lowercase();
2039        let first_char = word.chars().next().unwrap();
2040
2041        // Disambiguate "that" as determiner vs complementizer
2042        // "that dog" → Article(Distal), "I know that he ran" → That (complementizer)
2043        if lower == "that" {
2044            if let Some(next) = self.peek_word(1) {
2045                let next_lower = next.to_lowercase();
2046                if self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower) {
2047                    return TokenType::Article(Definiteness::Distal);
2048                }
2049            }
2050        }
2051
2052        // Arrow token for return type syntax
2053        if word == "->" {
2054            return TokenType::Arrow;
2055        }
2056
2057        // Grand Challenge: Comparison operator tokens
2058        if word == "<=" {
2059            return TokenType::LtEq;
2060        }
2061        if word == ">=" {
2062            return TokenType::GtEq;
2063        }
2064        if word == "==" {
2065            return TokenType::EqEq;
2066        }
2067        if word == "!=" {
2068            return TokenType::NotEq;
2069        }
2070        if word == "<" {
2071            return TokenType::Lt;
2072        }
2073        if word == ">" {
2074            return TokenType::Gt;
2075        }
2076        // Single = for assignment (must come after == check)
2077        if word == "=" {
2078            return TokenType::Assign;
2079        }
2080
2081        if let Some(kind) = lexicon::lookup_keyword(&lower) {
2082            return kind;
2083        }
2084
2085        if let Some(kind) = lexicon::lookup_pronoun(&lower) {
2086            return kind;
2087        }
2088
2089        if let Some(def) = lexicon::lookup_article(&lower) {
2090            return TokenType::Article(def);
2091        }
2092
2093        if let Some(time) = lexicon::lookup_auxiliary(&lower) {
2094            return TokenType::Auxiliary(time);
2095        }
2096
2097        // Handle imperative keywords that might conflict with prepositions
2098        match lower.as_str() {
2099            "call" => return TokenType::Call,
2100            "in" if self.mode == LexerMode::Imperative => return TokenType::In,
2101            // Zone keywords (must come before is_preposition check)
2102            "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
2103            // "at" for chunk access (must come before is_preposition check)
2104            "at" if self.mode == LexerMode::Imperative => return TokenType::At,
2105            // "into" for pipe send (must come before is_preposition check)
2106            "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
2107            // Temporal span operator (must come before is_preposition check)
2108            "before" => return TokenType::Before,
2109            _ => {}
2110        }
2111
2112        if lexicon::is_preposition(&lower) {
2113            let sym = self.interner.intern(&lower);
2114            return TokenType::Preposition(sym);
2115        }
2116
2117        match lower.as_str() {
2118            "equals" => return TokenType::Equals,
2119            "item" => return TokenType::Item,
2120            "items" => return TokenType::Items,
2121            // Mutability keyword for `mut x = 5` syntax
2122            "mut" if self.mode == LexerMode::Imperative => return TokenType::Mut,
2123            "let" => {
2124                self.in_let_context = true;
2125                return TokenType::Let;
2126            }
2127            "set" => {
2128                // Check if "set" is used as a type (followed by "of") - "Set of Int"
2129                // This takes priority over the assignment keyword
2130                if self.peek_word(1).map_or(false, |w| w.to_lowercase() == "of") {
2131                    // It's a type like "Set of Int" - don't return keyword, let it be a noun
2132                } else if self.mode == LexerMode::Imperative {
2133                    // In Imperative mode, treat "set" as the assignment keyword
2134                    return TokenType::Set;
2135                } else {
2136                    // In Declarative mode, check positions 2-5 for "to"
2137                    // (handles field access like "set p's x to")
2138                    for offset in 2..=5 {
2139                        if self.peek_word(offset).map_or(false, |w| w.to_lowercase() == "to") {
2140                            return TokenType::Set;
2141                        }
2142                    }
2143                }
2144            }
2145            "return" => return TokenType::Return,
2146            "break" => return TokenType::Break,
2147            "xor" => return TokenType::Xor,
2148            "shifted" => return TokenType::Shifted,
2149            "be" if self.in_let_context => {
2150                self.in_let_context = false;
2151                return TokenType::Be;
2152            }
2153            "while" => return TokenType::While,
2154            "assert" => return TokenType::Assert,
2155            "trust" => return TokenType::Trust,
2156            "check" => return TokenType::Check,
2157            // Theorem keywords (Declarative mode - for theorem blocks)
2158            "given" if self.mode == LexerMode::Declarative => return TokenType::Given,
2159            "prove" if self.mode == LexerMode::Declarative => return TokenType::Prove,
2160            "auto" if self.mode == LexerMode::Declarative => return TokenType::Auto,
2161            // P2P Networking keywords (Imperative mode only)
2162            "listen" if self.mode == LexerMode::Imperative => return TokenType::Listen,
2163            "connect" if self.mode == LexerMode::Imperative => return TokenType::NetConnect,
2164            "sleep" if self.mode == LexerMode::Imperative => return TokenType::Sleep,
2165            // GossipSub keywords (Imperative mode only)
2166            "sync" if self.mode == LexerMode::Imperative => return TokenType::Sync,
2167            // Persistence keywords
2168            "mount" if self.mode == LexerMode::Imperative => return TokenType::Mount,
2169            "persistent" => return TokenType::Persistent,  // Works in type expressions
2170            "combined" if self.mode == LexerMode::Imperative => return TokenType::Combined,
2171            // Go-like Concurrency keywords (Imperative mode only)
2172            // Note: "first" and "after" are NOT keywords - they're checked via lookahead in parser
2173            // to avoid conflicting with their use as variable names
2174            "launch" if self.mode == LexerMode::Imperative => return TokenType::Launch,
2175            "task" if self.mode == LexerMode::Imperative => return TokenType::Task,
2176            "pipe" if self.mode == LexerMode::Imperative => return TokenType::Pipe,
2177            "receive" if self.mode == LexerMode::Imperative => return TokenType::Receive,
2178            "stop" if self.mode == LexerMode::Imperative => return TokenType::Stop,
2179            "try" if self.mode == LexerMode::Imperative => return TokenType::Try,
2180            "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
2181            "native" => return TokenType::Native,
2182            "escape" if self.mode == LexerMode::Imperative => return TokenType::Escape,
2183            "from" => return TokenType::From,
2184            "otherwise" => return TokenType::Otherwise,
2185            // Phase 30c: Else/elif as aliases for Otherwise/Otherwise If
2186            "else" => return TokenType::Else,
2187            "elif" => return TokenType::Elif,
2188            // Sum type definition (Declarative mode only - for enum "either...or...")
2189            "either" if self.mode == LexerMode::Declarative => return TokenType::Either,
2190            // Pattern matching statement
2191            "inspect" if self.mode == LexerMode::Imperative => return TokenType::Inspect,
2192            // Constructor keyword (Imperative mode only)
2193            "new" if self.mode == LexerMode::Imperative => return TokenType::New,
2194            // Only emit Give/Show as keywords in Imperative mode
2195            // In Declarative mode, they fall through to lexicon lookup as verbs
2196            "give" if self.mode == LexerMode::Imperative => return TokenType::Give,
2197            "show" if self.mode == LexerMode::Imperative => return TokenType::Show,
2198            // Collection operation keywords (Imperative mode only)
2199            "push" if self.mode == LexerMode::Imperative => return TokenType::Push,
2200            "pop" if self.mode == LexerMode::Imperative => return TokenType::Pop,
2201            "copy" if self.mode == LexerMode::Imperative => return TokenType::Copy,
2202            "through" if self.mode == LexerMode::Imperative => return TokenType::Through,
2203            "length" if self.mode == LexerMode::Imperative => return TokenType::Length,
2204            "at" if self.mode == LexerMode::Imperative => return TokenType::At,
2205            // Set operation keywords (Imperative mode only)
2206            "add" if self.mode == LexerMode::Imperative => return TokenType::Add,
2207            "remove" if self.mode == LexerMode::Imperative => return TokenType::Remove,
2208            "contains" if self.mode == LexerMode::Imperative => return TokenType::Contains,
2209            "union" if self.mode == LexerMode::Imperative => return TokenType::Union,
2210            "intersection" if self.mode == LexerMode::Imperative => return TokenType::Intersection,
2211            // Zone keywords (Imperative mode only)
2212            "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
2213            "zone" if self.mode == LexerMode::Imperative => return TokenType::Zone,
2214            "called" if self.mode == LexerMode::Imperative => return TokenType::Called,
2215            "size" if self.mode == LexerMode::Imperative => return TokenType::Size,
2216            "mapped" if self.mode == LexerMode::Imperative => return TokenType::Mapped,
2217            // Structured Concurrency keywords (Imperative mode only)
2218            "attempt" if self.mode == LexerMode::Imperative => return TokenType::Attempt,
2219            "following" if self.mode == LexerMode::Imperative => return TokenType::Following,
2220            "simultaneously" if self.mode == LexerMode::Imperative => return TokenType::Simultaneously,
2221            // IO keywords (Imperative mode only)
2222            "read" if self.mode == LexerMode::Imperative => return TokenType::Read,
2223            "write" if self.mode == LexerMode::Imperative => return TokenType::Write,
2224            "console" if self.mode == LexerMode::Imperative => return TokenType::Console,
2225            "file" if self.mode == LexerMode::Imperative => return TokenType::File,
2226            // Agent System keywords (Imperative mode only)
2227            "spawn" if self.mode == LexerMode::Imperative => return TokenType::Spawn,
2228            "send" if self.mode == LexerMode::Imperative => return TokenType::Send,
2229            "await" if self.mode == LexerMode::Imperative => return TokenType::Await,
2230            // Serialization keyword (works in Definition blocks too)
2231            "portable" => return TokenType::Portable,
2232            // Sipping Protocol keywords (Imperative mode only)
2233            "manifest" if self.mode == LexerMode::Imperative => return TokenType::Manifest,
2234            "chunk" if self.mode == LexerMode::Imperative => return TokenType::Chunk,
2235            // CRDT keywords
2236            "shared" => return TokenType::Shared,  // Works in Definition blocks like Portable
2237            "merge" if self.mode == LexerMode::Imperative => return TokenType::Merge,
2238            "increase" if self.mode == LexerMode::Imperative => return TokenType::Increase,
2239            // Extended CRDT keywords
2240            "decrease" if self.mode == LexerMode::Imperative => return TokenType::Decrease,
2241            "append" if self.mode == LexerMode::Imperative => return TokenType::Append,
2242            "resolve" if self.mode == LexerMode::Imperative => return TokenType::Resolve,
2243            "values" if self.mode == LexerMode::Imperative => return TokenType::Values,
2244            // Type keywords (work in both modes like "Shared"):
2245            "tally" => return TokenType::Tally,
2246            "sharedset" => return TokenType::SharedSet,
2247            "sharedsequence" => return TokenType::SharedSequence,
2248            "collaborativesequence" => return TokenType::CollaborativeSequence,
2249            "sharedmap" => return TokenType::SharedMap,
2250            "divergent" => return TokenType::Divergent,
2251            "removewins" => return TokenType::RemoveWins,
2252            "addwins" => return TokenType::AddWins,
2253            "yata" => return TokenType::YATA,
2254            // Calendar time unit words (Span expressions)
2255            "day" | "days" => return TokenType::CalendarUnit(CalendarUnit::Day),
2256            "week" | "weeks" => return TokenType::CalendarUnit(CalendarUnit::Week),
2257            "month" | "months" => return TokenType::CalendarUnit(CalendarUnit::Month),
2258            "year" | "years" => return TokenType::CalendarUnit(CalendarUnit::Year),
2259            // Span-related keywords (note: "before" is handled earlier to avoid preposition conflict)
2260            "ago" => return TokenType::Ago,
2261            "hence" => return TokenType::Hence,
2262            "if" => return TokenType::If,
2263            "only" => return TokenType::Focus(FocusKind::Only),
2264            "even" => return TokenType::Focus(FocusKind::Even),
2265            "just" if self.peek_word(1).map_or(false, |w| {
2266                !self.is_verb_like(w) || w.to_lowercase() == "john" || w.chars().next().map_or(false, |c| c.is_uppercase())
2267            }) => return TokenType::Focus(FocusKind::Just),
2268            "much" => return TokenType::Measure(MeasureKind::Much),
2269            "little" => return TokenType::Measure(MeasureKind::Little),
2270            _ => {}
2271        }
2272
2273        if lexicon::is_scopal_adverb(&lower) {
2274            let sym = self.interner.intern(&Self::capitalize(&lower));
2275            return TokenType::ScopalAdverb(sym);
2276        }
2277
2278        if lexicon::is_temporal_adverb(&lower) {
2279            let sym = self.interner.intern(&Self::capitalize(&lower));
2280            return TokenType::TemporalAdverb(sym);
2281        }
2282
2283        if lexicon::is_non_intersective(&lower) {
2284            let sym = self.interner.intern(&Self::capitalize(&lower));
2285            return TokenType::NonIntersectiveAdjective(sym);
2286        }
2287
2288        if lexicon::is_adverb(&lower) {
2289            let sym = self.interner.intern(&Self::capitalize(&lower));
2290            return TokenType::Adverb(sym);
2291        }
2292        if lower.ends_with("ly") && !lexicon::is_not_adverb(&lower) && lower.len() > 4 {
2293            let sym = self.interner.intern(&Self::capitalize(&lower));
2294            return TokenType::Adverb(sym);
2295        }
2296
2297        if let Some(base) = self.try_parse_superlative(&lower) {
2298            let sym = self.interner.intern(&base);
2299            return TokenType::Superlative(sym);
2300        }
2301
2302        // Handle irregular comparatives (less, more, better, worse)
2303        let irregular_comparative = match lower.as_str() {
2304            "less" => Some("Little"),
2305            "more" => Some("Much"),
2306            "better" => Some("Good"),
2307            "worse" => Some("Bad"),
2308            _ => None,
2309        };
2310        if let Some(base) = irregular_comparative {
2311            let sym = self.interner.intern(base);
2312            return TokenType::Comparative(sym);
2313        }
2314
2315        if let Some(base) = self.try_parse_comparative(&lower) {
2316            let sym = self.interner.intern(&base);
2317            return TokenType::Comparative(sym);
2318        }
2319
2320        if lexicon::is_performative(&lower) {
2321            // If the word is also a common noun AND follows a determiner or precedes a copula,
2322            // don't force performative reading.
2323            // "every request holds" → request is a noun, not a performative verb.
2324            // "If request is asserted" → request is a noun (subject before copula).
2325            // "I promise to come" → promise IS a performative verb.
2326            let after_determiner = self.prev_token_is_determiner();
2327            let before_copula = self.next_token_is_copula();
2328            if !lexicon::is_common_noun(&lower) || (!after_determiner && !before_copula) {
2329                let sym = self.interner.intern(&Self::capitalize(&lower));
2330                return TokenType::Performative(sym);
2331            }
2332            // Fall through to noun/verb disambiguation below
2333        }
2334
2335        if lexicon::is_base_verb_early(&lower) {
2336            // If the word is also a common noun AND follows a determiner or precedes a copula,
2337            // don't force verb reading.
2338            // "every grant holds" → grant is a noun, not a verb.
2339            // "If grant is low" → grant is a noun (subject before copula).
2340            let after_determiner = self.prev_token_is_determiner();
2341            let before_copula = self.next_token_is_copula();
2342            if !lexicon::is_common_noun(&lower) || (!after_determiner && !before_copula) {
2343                let sym = self.interner.intern(&Self::capitalize(&lower));
2344                let class = lexicon::lookup_verb_class(&lower);
2345                return TokenType::Verb {
2346                    lemma: sym,
2347                    time: Time::Present,
2348                    aspect: Aspect::Simple,
2349                    class,
2350                };
2351            }
2352            // Fall through to noun/verb disambiguation below
2353        }
2354
2355        // Check for gerunds/progressive verbs BEFORE ProperName check
2356        // "Running" at start of sentence should be Verb, not ProperName
2357        if lower.ends_with("ing") && lower.len() > 4 {
2358            if let Some(entry) = self.lexicon.lookup_verb(&lower) {
2359                let sym = self.interner.intern(&entry.lemma);
2360                return TokenType::Verb {
2361                    lemma: sym,
2362                    time: entry.time,
2363                    aspect: entry.aspect,
2364                    class: entry.class,
2365                };
2366            }
2367        }
2368
2369        if first_char.is_uppercase() {
2370            // Smart Lexicon: Check if this capitalized word is actually a common noun
2371            // Only apply for sentence-initial words (followed by verb) to avoid
2372            // breaking type definitions like "A Point has:"
2373            //
2374            // Pattern: "Farmers walk." → Farmers is plural of Farmer (common noun)
2375            // Pattern: "A Point has:" → Point is a type name (proper name)
2376            if let Some(next) = self.peek_word(1) {
2377                let next_lower = next.to_lowercase();
2378                // If next word is a verb, this capitalized word is likely a subject noun
2379                let is_followed_by_verb = self.lexicon.lookup_verb(&next_lower).is_some()
2380                    || matches!(next_lower.as_str(), "is" | "are" | "was" | "were" | "has" | "have" | "had");
2381
2382                if is_followed_by_verb {
2383                    // Check if lowercase version is a derivable common noun
2384                    if let Some(analysis) = lexicon::analyze_word(&lower) {
2385                        match analysis {
2386                            lexicon::WordAnalysis::Noun(meta) if meta.number == lexicon::Number::Plural => {
2387                                // It's a plural noun - definitely a common noun
2388                                let sym = self.interner.intern(&lower);
2389                                return TokenType::Noun(sym);
2390                            }
2391                            lexicon::WordAnalysis::DerivedNoun { number: lexicon::Number::Plural, .. } => {
2392                                // Derived plural agentive noun (e.g., "Bloggers")
2393                                let sym = self.interner.intern(&lower);
2394                                return TokenType::Noun(sym);
2395                            }
2396                            _ => {
2397                                // Singular nouns at sentence start could still be proper names
2398                                // e.g., "John walks." vs "Farmer walks."
2399                            }
2400                        }
2401                    }
2402                }
2403            }
2404
2405            let sym = self.interner.intern(word);
2406            return TokenType::ProperName(sym);
2407        }
2408
2409        let verb_entry = self.lexicon.lookup_verb(&lower);
2410        let is_noun = lexicon::is_common_noun(&lower);
2411        let is_adj = self.is_adjective_like(&lower);
2412        let is_disambiguated = lexicon::is_disambiguation_not_verb(&lower);
2413
2414        // Ambiguous: word is Verb AND (Noun OR Adjective), not disambiguated
2415        if verb_entry.is_some() && (is_noun || is_adj) && !is_disambiguated {
2416            let entry = verb_entry.unwrap();
2417            let verb_token = TokenType::Verb {
2418                lemma: self.interner.intern(&entry.lemma),
2419                time: entry.time,
2420                aspect: entry.aspect,
2421                class: entry.class,
2422            };
2423
2424            let mut alternatives = Vec::new();
2425            if is_noun {
2426                alternatives.push(TokenType::Noun(self.interner.intern(word)));
2427            }
2428            if is_adj {
2429                alternatives.push(TokenType::Adjective(self.interner.intern(word)));
2430            }
2431
2432            return TokenType::Ambiguous {
2433                primary: Box::new(verb_token),
2434                alternatives,
2435            };
2436        }
2437
2438        // Disambiguated to noun/adjective (not verb)
2439        if let Some(_) = &verb_entry {
2440            if is_disambiguated {
2441                let sym = self.interner.intern(word);
2442                if is_noun {
2443                    return TokenType::Noun(sym);
2444                }
2445                return TokenType::Adjective(sym);
2446            }
2447        }
2448
2449        // Pure verb
2450        if let Some(entry) = verb_entry {
2451            let sym = self.interner.intern(&entry.lemma);
2452            return TokenType::Verb {
2453                lemma: sym,
2454                time: entry.time,
2455                aspect: entry.aspect,
2456                class: entry.class,
2457            };
2458        }
2459
2460        // Pure noun
2461        if is_noun {
2462            let sym = self.interner.intern(word);
2463            return TokenType::Noun(sym);
2464        }
2465
2466        if lexicon::is_base_verb(&lower) {
2467            let sym = self.interner.intern(&Self::capitalize(&lower));
2468            let class = lexicon::lookup_verb_class(&lower);
2469            return TokenType::Verb {
2470                lemma: sym,
2471                time: Time::Present,
2472                aspect: Aspect::Simple,
2473                class,
2474            };
2475        }
2476
2477        if lower.ends_with("ian")
2478            || lower.ends_with("er")
2479            || lower == "logic"
2480            || lower == "time"
2481            || lower == "men"
2482            || lower == "book"
2483            || lower == "house"
2484            || lower == "code"
2485            || lower == "user"
2486        {
2487            let sym = self.interner.intern(word);
2488            return TokenType::Noun(sym);
2489        }
2490
2491        if lexicon::is_particle(&lower) {
2492            let sym = self.interner.intern(&lower);
2493            return TokenType::Particle(sym);
2494        }
2495
2496        let sym = self.interner.intern(word);
2497        TokenType::Adjective(sym)
2498    }
2499
2500    fn capitalize(s: &str) -> String {
2501        let mut chars = s.chars();
2502        match chars.next() {
2503            None => String::new(),
2504            Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
2505        }
2506    }
2507
2508    pub fn is_collective_verb(lemma: &str) -> bool {
2509        lexicon::is_collective_verb(&lemma.to_lowercase())
2510    }
2511
2512    pub fn is_mixed_verb(lemma: &str) -> bool {
2513        lexicon::is_mixed_verb(&lemma.to_lowercase())
2514    }
2515
2516    pub fn is_distributive_verb(lemma: &str) -> bool {
2517        lexicon::is_distributive_verb(&lemma.to_lowercase())
2518    }
2519
2520    pub fn is_intensional_predicate(lemma: &str) -> bool {
2521        lexicon::is_intensional_predicate(&lemma.to_lowercase())
2522    }
2523
2524    pub fn is_opaque_verb(lemma: &str) -> bool {
2525        lexicon::is_opaque_verb(&lemma.to_lowercase())
2526    }
2527
2528    pub fn is_ditransitive_verb(lemma: &str) -> bool {
2529        lexicon::is_ditransitive_verb(&lemma.to_lowercase())
2530    }
2531
2532    fn is_verb_like(&self, word: &str) -> bool {
2533        let lower = word.to_lowercase();
2534        if lexicon::is_infinitive_verb(&lower) {
2535            return true;
2536        }
2537        if let Some(entry) = self.lexicon.lookup_verb(&lower) {
2538            return entry.lemma.len() > 0;
2539        }
2540        false
2541    }
2542
2543    pub fn is_subject_control_verb(lemma: &str) -> bool {
2544        lexicon::is_subject_control_verb(&lemma.to_lowercase())
2545    }
2546
2547    pub fn is_raising_verb(lemma: &str) -> bool {
2548        lexicon::is_raising_verb(&lemma.to_lowercase())
2549    }
2550
2551    pub fn is_object_control_verb(lemma: &str) -> bool {
2552        lexicon::is_object_control_verb(&lemma.to_lowercase())
2553    }
2554
2555    pub fn is_weather_verb(lemma: &str) -> bool {
2556        matches!(
2557            lemma.to_lowercase().as_str(),
2558            "rain" | "snow" | "hail" | "thunder" | "pour"
2559        )
2560    }
2561
2562    fn try_parse_superlative(&self, word: &str) -> Option<String> {
2563        if !word.ends_with("est") || word.len() < 5 {
2564            return None;
2565        }
2566
2567        let base = &word[..word.len() - 3];
2568
2569        if base.len() >= 2 {
2570            let chars: Vec<char> = base.chars().collect();
2571            let last = chars[chars.len() - 1];
2572            let second_last = chars[chars.len() - 2];
2573            if last == second_last && !"aeiou".contains(last) {
2574                let stem = &base[..base.len() - 1];
2575                if lexicon::is_gradable_adjective(stem) {
2576                    return Some(Self::capitalize(stem));
2577                }
2578            }
2579        }
2580
2581        if base.ends_with("i") {
2582            let stem = format!("{}y", &base[..base.len() - 1]);
2583            if lexicon::is_gradable_adjective(&stem) {
2584                return Some(Self::capitalize(&stem));
2585            }
2586        }
2587
2588        if lexicon::is_gradable_adjective(base) {
2589            return Some(Self::capitalize(base));
2590        }
2591
2592        None
2593    }
2594
2595    fn try_parse_comparative(&self, word: &str) -> Option<String> {
2596        if !word.ends_with("er") || word.len() < 4 {
2597            return None;
2598        }
2599
2600        let base = &word[..word.len() - 2];
2601
2602        if base.len() >= 2 {
2603            let chars: Vec<char> = base.chars().collect();
2604            let last = chars[chars.len() - 1];
2605            let second_last = chars[chars.len() - 2];
2606            if last == second_last && !"aeiou".contains(last) {
2607                let stem = &base[..base.len() - 1];
2608                if lexicon::is_gradable_adjective(stem) {
2609                    return Some(Self::capitalize(stem));
2610                }
2611            }
2612        }
2613
2614        if base.ends_with("i") {
2615            let stem = format!("{}y", &base[..base.len() - 1]);
2616            if lexicon::is_gradable_adjective(&stem) {
2617                return Some(Self::capitalize(&stem));
2618            }
2619        }
2620
2621        if lexicon::is_gradable_adjective(base) {
2622            return Some(Self::capitalize(base));
2623        }
2624
2625        None
2626    }
2627}
2628
// Unit tests for the Stage-2 word lexer. Each test drives the full
// `Lexer::tokenize` pipeline on a small input and inspects the resulting
// token kinds and spans.
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: a contraction ("it's") must tokenize without panicking.
    #[test]
    fn lexer_handles_apostrophe() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("it's raining", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // Smoke test: a trailing question mark must tokenize without panicking.
    #[test]
    fn lexer_handles_question_mark() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Is it raining?", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // "ring" is noun/verb ambiguous; disambiguation must pick the noun reading.
    #[test]
    fn ring_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("ring", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // Diagnostic test: dumps the token stream and verifies that "that" in a
    // relative clause is lexed as the dedicated `TokenType::That` keyword.
    #[test]
    fn debug_that_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("The cat that runs", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        let that_token = tokens.iter().find(|t| interner.resolve(t.lexeme) == "that");
        if let Some(t) = that_token {
            // Verify discriminant comparison works
            let check = std::mem::discriminant(&t.kind) == std::mem::discriminant(&TokenType::That);
            eprintln!("Discriminant check for That: {}", check);
            assert!(matches!(t.kind, TokenType::That), "'that' should be TokenType::That, got {:?}", t.kind);
        } else {
            panic!("No 'that' token found");
        }
    }

    // "bus" is noun/verb ambiguous; disambiguation must pick the noun reading.
    #[test]
    fn bus_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("bus", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // Lowercase "a" is the indefinite article, and the following word is
    // classified as a noun.
    #[test]
    fn lowercase_a_is_article() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("a car", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        assert_eq!(tokens[0].kind, TokenType::Article(Definiteness::Indefinite));
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)), "Expected Noun, got {:?}", tokens[1].kind);
    }

    // "open" (verb + adjective) must produce an Ambiguous token with the verb
    // as primary and the adjective among the alternatives.
    #[test]
    fn open_is_ambiguous() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("open", &mut interner);
        let tokens = lexer.tokenize();

        if let TokenType::Ambiguous { primary, alternatives } = &tokens[0].kind {
            assert!(matches!(**primary, TokenType::Verb { .. }), "Primary should be Verb");
            assert!(alternatives.iter().any(|t| matches!(t, TokenType::Adjective(_))),
                "Should have Adjective alternative");
        } else {
            panic!("Expected Ambiguous token for 'open', got {:?}", tokens[0].kind);
        }
    }

    // Classic syllogism premise: quantifier, noun, and copula in order.
    #[test]
    fn basic_tokenization() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("All men are mortal.", &mut interner);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].kind, TokenType::All);
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)));
        assert_eq!(tokens[2].kind, TokenType::Are);
    }

    // The multiword phrase "if and only if" collapses to a single Iff token.
    #[test]
    fn iff_tokenizes_as_single_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("A if and only if B", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Iff),
            "should contain Iff token: got {:?}",
            tokens
        );
    }

    // "is equal to" collapses to the Identity token.
    #[test]
    fn is_equal_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Socrates is equal to Socrates", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    // "is identical to" also collapses to the Identity token.
    #[test]
    fn is_identical_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Clark is identical to Superman", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    // "itself" lexes as the Reflexive pronoun token.
    #[test]
    fn itself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John loves itself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    // "himself" lexes as the Reflexive pronoun token.
    #[test]
    fn himself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John sees himself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    // Infinitival "to" yields a To token, and the bare infinitive after it is
    // still lexed as a Verb.
    #[test]
    fn to_stay_tokenizes_correctly() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("to stay", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::To),
            "should contain To token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(t.kind, TokenType::Verb { .. })),
            "should contain Verb token for stay: got {:?}",
            tokens
        );
    }

    // "John's" splits into a ProperName plus a Possessive marker token.
    #[test]
    fn possessive_apostrophe_s() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John's dog", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Possessive),
            "should contain Possessive token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(&t.kind, TokenType::ProperName(_))),
            "should have John as proper name: got {:?}",
            tokens
        );
    }

    // Every token's span must be a correct byte range into the original
    // input, with EOF anchored at the input's end.
    #[test]
    fn lexer_produces_valid_spans() {
        let input = "All men are mortal.";
        let mut interner = Interner::new();
        let mut lexer = Lexer::new(input, &mut interner);
        let tokens = lexer.tokenize();

        // "All" at 0..3
        assert_eq!(tokens[0].span.start, 0);
        assert_eq!(tokens[0].span.end, 3);
        assert_eq!(&input[tokens[0].span.start..tokens[0].span.end], "All");

        // "men" at 4..7
        assert_eq!(tokens[1].span.start, 4);
        assert_eq!(tokens[1].span.end, 7);
        assert_eq!(&input[tokens[1].span.start..tokens[1].span.end], "men");

        // "are" at 8..11
        assert_eq!(tokens[2].span.start, 8);
        assert_eq!(tokens[2].span.end, 11);
        assert_eq!(&input[tokens[2].span.start..tokens[2].span.end], "are");

        // "mortal" at 12..18
        assert_eq!(tokens[3].span.start, 12);
        assert_eq!(tokens[3].span.end, 18);
        assert_eq!(&input[tokens[3].span.start..tokens[3].span.end], "mortal");

        // "." at 18..19
        assert_eq!(tokens[4].span.start, 18);
        assert_eq!(tokens[4].span.end, 19);

        // EOF at end
        assert_eq!(tokens[5].span.start, input.len());
        assert_eq!(tokens[5].kind, TokenType::EOF);
    }

    // A triple-quoted multiline string must surface as a single string token
    // (plain or interpolated) whose content preserves the inner lines.
    #[test]
    fn triple_quote_produces_string_token() {
        let mut interner = Interner::new();
        let source = "## Main\nLet msg be \"\"\"\n    Hello\n    World\n\"\"\".\nShow msg.";
        let mut lexer = Lexer::new(source, &mut interner);
        let tokens = lexer.tokenize();
        // Dump all tokens for debugging
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} lex={:?} span={}..{}", i, t.kind, lex, t.span.start, t.span.end);
        }
        // Find the string token
        let str_token = tokens.iter().find(|t| matches!(t.kind, TokenType::StringLiteral(_) | TokenType::InterpolatedString(_)));
        assert!(str_token.is_some(), "Should have a string token. Tokens: {:?}", tokens.iter().map(|t| format!("{:?}", t.kind)).collect::<Vec<_>>());
        if let Some(tok) = str_token {
            let content = interner.resolve(tok.lexeme);
            eprintln!("Triple-quote content: {:?}", content);
            assert!(content.contains("Hello"), "Should contain Hello, got: {:?}", content);
        }
    }
}