// logicaffeine_language/mwe.rs

1//! Multi-Word Expression (MWE) processing.
2//!
3//! Post-tokenization pipeline that collapses multi-token sequences
4//! into single semantic units (e.g., "fire engine" → FireEngine).
5//!
6//! # How It Works
7//!
8//! The MWE pipeline runs between lexing and parsing:
9//!
10//! 1. Build a trie from known multi-word expressions
11//! 2. Scan the token stream for matches using [`apply_mwe_pipeline`]
12//! 3. Replace matched sequences with single tokens
13//!
14//! # Supported MWE Types
15//!
16//! - **Compound nouns**: "fire engine", "ice cream"
17//! - **Phrasal verbs**: "look up", "give in"
18//! - **Fixed phrases**: "in order to", "as well as"
19//!
20//! # Key Functions
21//!
22//! - [`build_mwe_trie`]: Construct the MWE lookup trie
23//! - [`apply_mwe_pipeline`]: Transform token stream by collapsing MWEs
24
25use std::collections::HashMap;
26use crate::token::{Token, TokenType};
27use crate::lexicon::{VerbClass, Time, Aspect};
28use logicaffeine_base::Interner;
29
/// The single-token replacement for a matched multi-word expression.
#[derive(Debug, Clone)]
pub struct MweTarget {
    /// Canonical lemma of the collapsed expression (e.g. "fire_engine").
    pub lemma: &'static str,
    /// Part-of-speech tag selecting the output `TokenType`
    /// ("Noun", "Verb", "Preposition", "Conjunction", "Quantifier").
    pub pos: &'static str,
    /// Verb class for `pos == "Verb"` targets; `None` falls back to
    /// `VerbClass::Activity` when the merged token is built.
    pub class: Option<VerbClass>,
}
36
/// Prefix trie over lowercased lemmas used to match multi-word expressions.
///
/// Each edge is one word; a node with `target == Some(..)` marks the end of
/// a complete MWE pattern (interior nodes may also be targets, enabling
/// longest-match behavior).
#[derive(Default, Debug)]
pub struct MweTrie {
    /// Child nodes keyed by the next lowercased word of the pattern.
    pub children: HashMap<String, MweTrie>,
    /// Replacement token info if a pattern ends at this node.
    pub target: Option<MweTarget>,
}
42
43impl MweTrie {
44    pub fn insert(&mut self, pattern: &[&str], target: MweTarget) {
45        if pattern.is_empty() {
46            self.target = Some(target);
47            return;
48        }
49        self.children
50            .entry(pattern[0].to_lowercase())
51            .or_default()
52            .insert(&pattern[1..], target);
53    }
54}
55
56/// Apply MWE collapsing to a token stream.
57/// Matches on lemmas (not raw strings) to handle morphological variants.
58pub fn apply_mwe_pipeline(
59    tokens: Vec<Token>,
60    trie: &MweTrie,
61    interner: &mut Interner,
62) -> Vec<Token> {
63    let mut result = Vec::new();
64    let mut i = 0;
65
66    while i < tokens.len() {
67        if let Some((match_len, target)) = find_longest_match(&tokens[i..], trie, interner) {
68            let merged = create_merged_token(&tokens[i], target, interner);
69            result.push(merged);
70            i += match_len;
71        } else {
72            result.push(tokens[i].clone());
73            i += 1;
74        }
75    }
76    result
77}
78
79/// Extract lemma from a token for MWE matching.
80/// Uses lowercase for case-insensitive matching.
81fn get_lemma(token: &Token, interner: &Interner) -> String {
82    match &token.kind {
83        TokenType::Verb { lemma, .. } => interner.resolve(*lemma).to_lowercase(),
84        TokenType::Noun(sym) => interner.resolve(*sym).to_lowercase(),
85        TokenType::Adjective(sym) => interner.resolve(*sym).to_lowercase(),
86        TokenType::NonIntersectiveAdjective(sym) => interner.resolve(*sym).to_lowercase(),
87        TokenType::Preposition(sym) => interner.resolve(*sym).to_lowercase(),
88        TokenType::Particle(sym) => interner.resolve(*sym).to_lowercase(),
89        TokenType::Article(_) => interner.resolve(token.lexeme).to_lowercase(),
90        _ => interner.resolve(token.lexeme).to_lowercase(),
91    }
92}
93
94/// Find the longest MWE match starting at the beginning of the token slice.
95fn find_longest_match<'a>(
96    tokens: &[Token],
97    trie: &'a MweTrie,
98    interner: &Interner,
99) -> Option<(usize, &'a MweTarget)> {
100    let mut node = trie;
101    let mut best: Option<(usize, &MweTarget)> = None;
102
103    for (i, token) in tokens.iter().enumerate() {
104        let lemma = get_lemma(token, interner);
105        if let Some(child) = node.children.get(&lemma) {
106            node = child;
107            if let Some(target) = &node.target {
108                best = Some((i + 1, target));
109            }
110        } else {
111            break;
112        }
113    }
114    best
115}
116
117/// Create a merged token from the MWE target, inheriting tense from the head token.
118fn create_merged_token(head: &Token, target: &MweTarget, interner: &mut Interner) -> Token {
119    let lemma_sym = interner.intern(target.lemma);
120
121    let kind = match target.pos {
122        "Noun" => TokenType::Noun(lemma_sym),
123        "Verb" => {
124            let (time, aspect) = match &head.kind {
125                TokenType::Verb { time, aspect, .. } => (*time, *aspect),
126                _ => (Time::Present, Aspect::Simple),
127            };
128            TokenType::Verb {
129                lemma: lemma_sym,
130                time,
131                aspect,
132                class: target.class.unwrap_or(VerbClass::Activity),
133            }
134        }
135        "Preposition" => TokenType::Preposition(lemma_sym),
136        "Conjunction" => TokenType::And,
137        "Quantifier" => TokenType::NoOne,
138        _ => TokenType::Noun(lemma_sym),
139    };
140
141    Token {
142        kind,
143        lexeme: lemma_sym,
144        span: head.span,
145    }
146}
147
148include!(concat!(env!("OUT_DIR"), "/mwe_data.rs"));