Skip to main content

cqlsh_rs/
cql_lexer.rs

1//! Unified CQL lexer (tokenizer) with grammar-aware position tracking.
2//!
3//! Provides a single shared tokenizer that powers syntax highlighting (colorizer),
4//! tab completion (completer), and statement parsing (parser). Replaces three
5//! ad-hoc implementations with one consistent CQL understanding.
6//!
7//! Design: hand-written state machine, O(n) single pass, no dependencies.
8//! See `docs/plans/18-cql-lexer.md` for motivation and design decisions.
9
10/// A token produced by the CQL lexer.
11#[derive(Debug, Clone, PartialEq, Eq)]
12pub struct Token {
13    /// The kind of token.
14    pub kind: TokenKind,
15    /// The raw text of the token as it appears in the input.
16    pub text: String,
17    /// Byte offset where this token starts in the input.
18    pub start: usize,
19    /// Byte offset where this token ends (exclusive) in the input.
20    pub end: usize,
21}
22
/// The lexical category of a [`Token`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum TokenKind {
    /// A CQL keyword such as SELECT, FROM or WHERE. Classification is
    /// context-sensitive: the same word in identifier position (after FROM,
    /// after a dot, ...) is reported as `Identifier`.
    Keyword,
    /// A bare identifier: table, column, keyspace name and the like.
    Identifier,
    /// An identifier wrapped in double quotes (`"MyTable"`).
    QuotedIdentifier,
    /// A string literal wrapped in single quotes (`'hello'`).
    StringLiteral,
    /// A string literal delimited by `$$` (`$$body$$`).
    DollarStringLiteral,
    /// A number: integer or decimal (`42`, `3.14`, `-1`).
    NumberLiteral,
    /// A blob written as `0x` plus hex digits (`0xDEADBEEF`).
    BlobLiteral,
    /// A UUID (`550e8400-e29b-41d4-a716-446655440000`).
    UuidLiteral,
    /// `true` or `false`.
    BooleanLiteral,
    /// An operator (`=`, `<`, `>`, `<=`, `>=`, `!=`, `+`, `-`, etc.).
    Operator,
    /// Punctuation (`;`, `,`, `(`, `)`, `.`, `*`, `?`).
    Punctuation,
    /// A run of spaces, tabs or newlines.
    Whitespace,
    /// A `-- ...` comment running to the end of the line.
    LineComment,
    /// A `/* ... */` comment; nesting is allowed.
    BlockComment,
    /// A character the lexer does not recognize.
    Unknown,
}
58
/// The syntactic position the lexer is currently in. It decides whether a word
/// is a keyword or an identifier, and tells tab completion what may come next.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GrammarContext {
    /// Beginning of a statement; a leading keyword (SELECT, INSERT, ...) is expected.
    Start,
    /// Following SELECT: a column list, `*`, or DISTINCT is expected.
    ExpectColumnList,
    /// Following FROM / INTO / UPDATE / TABLE / INDEX ON: a table name is expected.
    ExpectTable,
    /// Following USE / KEYSPACE: a keyspace name is expected.
    ExpectKeyspace,
    /// Following WHERE / AND / IF (conditions): a column name is expected.
    ExpectColumn,
    /// Following SET (in UPDATE): `column = value` pairs are expected.
    ExpectSetClause,
    /// Following a dot: the second half of a qualified name is expected.
    ExpectQualifiedPart,
    /// Following a column/table type keyword: a CQL type name is expected.
    ExpectType,
    /// Following ORDER: BY is expected.
    ExpectOrderBy,
    /// Following ORDER BY: a column name is expected.
    ExpectOrderByColumn,
    /// Following VALUES: a parenthesized value list is expected.
    ExpectValues,
    /// Within the options of a WITH clause.
    ExpectWithOption,
    /// Following CONSISTENCY / SERIAL CONSISTENCY: a level name is expected.
    ExpectConsistencyLevel,
    /// Following DESCRIBE / DESC: a sub-command or schema name is expected.
    ExpectDescribeTarget,
    /// Following SOURCE / CAPTURE: a file path is expected.
    ExpectFilePath,
    /// Following CREATE: TABLE, KEYSPACE, INDEX, etc. is expected.
    ExpectCreateTarget,
    /// Following ALTER: TABLE, KEYSPACE, TYPE, etc. is expected.
    ExpectAlterTarget,
    /// Following DROP: TABLE, KEYSPACE, INDEX, etc. is expected.
    ExpectDropTarget,
    /// Following DELETE (before FROM): column names or FROM are expected.
    ExpectDeleteTarget,
    /// Following GRANT or REVOKE: permission names are expected.
    ExpectGrantRevoke,
    /// Following INSERT: INTO is expected.
    ExpectInsertTarget,
    /// Following BEGIN: BATCH, UNLOGGED or COUNTER is expected.
    ExpectBeginTarget,
    /// Following SELECT … FROM <table>: WHERE, ORDER BY, LIMIT, etc. are expected.
    ExpectSelectPostFrom,
    /// Following INSERT INTO … VALUES (…): IF NOT EXISTS or USING is expected.
    ExpectInsertPostValues,
    /// Following DELETE … FROM <table>: WHERE, USING TIMESTAMP or IF is expected.
    ExpectDeletePostFrom,
    /// Following UPDATE <table>: SET or USING is expected.
    ExpectUpdateClause,
    /// Following UPDATE … SET …: WHERE or IF is expected.
    ExpectUpdatePostSet,
    /// Generic clause position; the default inside a statement body.
    General,
}
120
/// Set of CQL keywords and shell commands.
///
/// Entries are uppercase ASCII and MUST stay in strictly ascending byte order:
/// [`is_cql_keyword`] binary-searches this table, so an out-of-order entry
/// silently becomes unfindable.
const CQL_KEYWORDS: &[&str] = &[
    "ADD",
    "AGGREGATE",
    "ALL",
    "ALLOW",
    "ALTER",
    "AND",
    "APPLY",
    "AS",
    "ASC",
    "AUTHORIZE",
    "BATCH",
    "BEGIN",
    "BY",
    "CALLED",
    "CAPTURE",
    "CLEAR",
    "CLS",
    "CLUSTERING",
    "COLUMN",
    "COMPACT",
    "CONSISTENCY",
    "CONTAINS",
    "COPY",
    "COUNT",
    "COUNTER",
    "CREATE",
    "CUSTOM",
    "DELETE",
    "DESC",
    "DESCRIBE",
    "DISTINCT",
    "DROP",
    "EACH_QUORUM",
    "ENTRIES",
    "EXECUTE",
    "EXISTS",
    "EXIT",
    "EXPAND",
    "FILTERING",
    "FINALFUNC",
    "FROM",
    "FROZEN",
    "FULL",
    "FUNCTION",
    "FUNCTIONS",
    "GRANT",
    "HELP",
    "IF",
    "IN",
    "INDEX",
    "INITCOND",
    "INPUT",
    "INSERT",
    "INTO",
    "IS",
    "JSON",
    "KEY",
    "KEYSPACE",
    "KEYSPACES",
    "LANGUAGE",
    "LIKE",
    "LIMIT",
    "LIST",
    "LOCAL_ONE",
    "LOCAL_QUORUM",
    "LOGIN",
    "MAP",
    "MATERIALIZED",
    "MODIFY",
    "NAMESPACE",
    "NORECURSIVE",
    "NOT",
    "NULL",
    "OF",
    "ON",
    "ONE",
    "OR",
    "ORDER",
    "PAGING",
    "PARTITION",
    "PASSWORD",
    "PER",
    "PERMISSION",
    "PERMISSIONS",
    "PRIMARY",
    "QUIT",
    "QUORUM",
    "RENAME",
    "REPLACE",
    "RETURNS",
    "REVOKE",
    "SCHEMA",
    "SELECT",
    "SERIAL",
    "SET",
    "SFUNC",
    "SHOW",
    "SOURCE",
    "STATIC",
    "STORAGE",
    "STYPE",
    "SUPERUSER",
    "TABLE",
    "TABLES",
    "TEXT",
    "THREE",
    "TIMESTAMP",
    "TO",
    "TOKEN",
    "TRACING",
    "TRIGGER",
    "TRUNCATE",
    "TTL",
    "TUPLE",
    "TWO",
    "TYPE",
    "UNICODE",
    "UNLOGGED",
    "UPDATE",
    "USE",
    "USER",
    "USERS",
    "USING",
    "VALUES",
    "VIEW",
    "WHERE",
    "WITH",
    "WRITETIME",
];

/// Check if a word is a CQL keyword, ignoring ASCII case.
///
/// The comparison folds only ASCII letters. Unicode-aware uppercasing would
/// map look-alike characters onto ASCII (`'ſ'` U+017F becomes `S`, `'ı'`
/// U+0131 becomes `I`) and wrongly report words such as `"ſet"` as keywords.
/// Folding byte-by-byte during the search also avoids allocating an uppercase
/// copy of `word` on every lookup.
///
/// Returns `true` when `word` matches an entry of `CQL_KEYWORDS`.
pub fn is_cql_keyword(word: &str) -> bool {
    CQL_KEYWORDS
        .binary_search_by(|keyword| {
            // Lexicographic byte comparison against the ASCII-uppercased
            // probe; this is the same order the table is sorted in.
            keyword
                .bytes()
                .cmp(word.bytes().map(|byte| byte.to_ascii_uppercase()))
        })
        .is_ok()
}
258
259/// Tokenize a CQL input string into a sequence of tokens with grammar context.
260///
261/// This is the main entry point. It performs a single O(n) pass and classifies
262/// each token using both lexical rules and grammar position tracking.
263pub fn tokenize(input: &str) -> Vec<Token> {
264    let mut tokens = Vec::new();
265    let mut ctx = GrammarContext::Start;
266    let bytes = input.as_bytes();
267    let len = bytes.len();
268    let mut i = 0;
269
270    while i < len {
271        let ch = bytes[i];
272
273        // Whitespace
274        if ch.is_ascii_whitespace() {
275            let start = i;
276            while i < len && bytes[i].is_ascii_whitespace() {
277                i += 1;
278            }
279            tokens.push(Token {
280                kind: TokenKind::Whitespace,
281                text: input[start..i].to_string(),
282                start,
283                end: i,
284            });
285            continue;
286        }
287
288        // Line comment: --
289        if ch == b'-' && i + 1 < len && bytes[i + 1] == b'-' {
290            let start = i;
291            i += 2;
292            while i < len && bytes[i] != b'\n' {
293                i += 1;
294            }
295            tokens.push(Token {
296                kind: TokenKind::LineComment,
297                text: input[start..i].to_string(),
298                start,
299                end: i,
300            });
301            continue;
302        }
303
304        // Block comment: /* ... */ (nested)
305        if ch == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
306            let start = i;
307            let mut depth: usize = 1;
308            i += 2;
309            while i < len && depth > 0 {
310                if bytes[i] == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
311                    depth += 1;
312                    i += 2;
313                } else if bytes[i] == b'*' && i + 1 < len && bytes[i + 1] == b'/' {
314                    depth -= 1;
315                    i += 2;
316                } else {
317                    i += 1;
318                }
319            }
320            tokens.push(Token {
321                kind: TokenKind::BlockComment,
322                text: input[start..i].to_string(),
323                start,
324                end: i,
325            });
326            continue;
327        }
328
329        // Single-quoted string literal
330        if ch == b'\'' {
331            let start = i;
332            i += 1;
333            loop {
334                if i >= len {
335                    break; // unterminated
336                }
337                if bytes[i] == b'\'' {
338                    i += 1;
339                    // Escaped quote '' — continue string
340                    if i < len && bytes[i] == b'\'' {
341                        i += 1;
342                        continue;
343                    }
344                    break;
345                }
346                // Advance by UTF-8 char length
347                i += char_len_at(bytes, i);
348            }
349            tokens.push(Token {
350                kind: TokenKind::StringLiteral,
351                text: input[start..i].to_string(),
352                start,
353                end: i,
354            });
355            ctx = advance_context_after_value(ctx);
356            continue;
357        }
358
359        // Double-quoted identifier
360        if ch == b'"' {
361            let start = i;
362            i += 1;
363            loop {
364                if i >= len {
365                    break; // unterminated
366                }
367                if bytes[i] == b'"' {
368                    i += 1;
369                    // Escaped quote "" — continue
370                    if i < len && bytes[i] == b'"' {
371                        i += 1;
372                        continue;
373                    }
374                    break;
375                }
376                i += char_len_at(bytes, i);
377            }
378            tokens.push(Token {
379                kind: TokenKind::QuotedIdentifier,
380                text: input[start..i].to_string(),
381                start,
382                end: i,
383            });
384            ctx = advance_context_after_name(ctx);
385            continue;
386        }
387
388        // Dollar-quoted string: $$...$$
389        if ch == b'$' && i + 1 < len && bytes[i + 1] == b'$' {
390            let start = i;
391            i += 2;
392            loop {
393                if i + 1 >= len {
394                    i = len;
395                    break; // unterminated
396                }
397                if bytes[i] == b'$' && bytes[i + 1] == b'$' {
398                    i += 2;
399                    break;
400                }
401                i += 1;
402            }
403            tokens.push(Token {
404                kind: TokenKind::DollarStringLiteral,
405                text: input[start..i].to_string(),
406                start,
407                end: i,
408            });
409            ctx = advance_context_after_value(ctx);
410            continue;
411        }
412
413        // Blob literal: 0x followed by hex digits
414        if ch == b'0' && i + 1 < len && (bytes[i + 1] == b'x' || bytes[i + 1] == b'X') {
415            let start = i;
416            i += 2;
417            while i < len && bytes[i].is_ascii_hexdigit() {
418                i += 1;
419            }
420            // Make sure it's not followed by an identifier char (would be an identifier)
421            if i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
422                // Actually an identifier starting with 0x... — backtrack
423                i = start;
424            } else {
425                tokens.push(Token {
426                    kind: TokenKind::BlobLiteral,
427                    text: input[start..i].to_string(),
428                    start,
429                    end: i,
430                });
431                ctx = advance_context_after_value(ctx);
432                continue;
433            }
434        }
435
436        // Number literal: digits, optional dot, optional exponent
437        // Also handles negative numbers when preceded by operator context
438        if ch.is_ascii_digit()
439            || (ch == b'-'
440                && i + 1 < len
441                && bytes[i + 1].is_ascii_digit()
442                && is_number_sign_position(&tokens))
443        {
444            let start = i;
445            if ch == b'-' {
446                i += 1;
447            }
448            while i < len && bytes[i].is_ascii_digit() {
449                i += 1;
450            }
451            // Decimal part
452            if i < len && bytes[i] == b'.' && i + 1 < len && bytes[i + 1].is_ascii_digit() {
453                i += 1;
454                while i < len && bytes[i].is_ascii_digit() {
455                    i += 1;
456                }
457            }
458            // Exponent
459            if i < len && (bytes[i] == b'e' || bytes[i] == b'E') {
460                let save = i;
461                i += 1;
462                if i < len && (bytes[i] == b'+' || bytes[i] == b'-') {
463                    i += 1;
464                }
465                if i < len && bytes[i].is_ascii_digit() {
466                    while i < len && bytes[i].is_ascii_digit() {
467                        i += 1;
468                    }
469                } else {
470                    i = save; // not an exponent, backtrack
471                }
472            }
473            // UUID check: number followed by '-' hex pattern (8-4-4-4-12)
474            if i < len && bytes[i] == b'-' && looks_like_uuid(input, start, i) {
475                // Parse the full UUID
476                let uuid_end = scan_uuid(input, start);
477                if uuid_end > i {
478                    i = uuid_end;
479                    tokens.push(Token {
480                        kind: TokenKind::UuidLiteral,
481                        text: input[start..i].to_string(),
482                        start,
483                        end: i,
484                    });
485                    ctx = advance_context_after_value(ctx);
486                    continue;
487                }
488            }
489            // Make sure this isn't part of an identifier (like table1)
490            if i < len && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_') {
491                // It's an identifier starting with digits — shouldn't happen in valid CQL,
492                // but treat as identifier
493                while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
494                    i += 1;
495                }
496                let word = &input[start..i];
497                tokens.push(Token {
498                    kind: classify_word(word, ctx),
499                    text: word.to_string(),
500                    start,
501                    end: i,
502                });
503                ctx = advance_context_after_word(word, ctx);
504                continue;
505            }
506            tokens.push(Token {
507                kind: TokenKind::NumberLiteral,
508                text: input[start..i].to_string(),
509                start,
510                end: i,
511            });
512            ctx = advance_context_after_value(ctx);
513            continue;
514        }
515
516        // Word: identifier or keyword
517        if ch.is_ascii_alphabetic() || ch == b'_' {
518            let start = i;
519            while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
520                i += 1;
521            }
522            let word = &input[start..i];
523
524            // UUID check: word of hex chars followed by '-' hex pattern
525            if i < len
526                && bytes[i] == b'-'
527                && word.len() == 8
528                && word.chars().all(|c| c.is_ascii_hexdigit())
529            {
530                let uuid_end = scan_uuid(input, start);
531                if uuid_end > i {
532                    i = uuid_end;
533                    tokens.push(Token {
534                        kind: TokenKind::UuidLiteral,
535                        text: input[start..i].to_string(),
536                        start,
537                        end: i,
538                    });
539                    ctx = advance_context_after_value(ctx);
540                    continue;
541                }
542            }
543
544            let kind = classify_word(word, ctx);
545            tokens.push(Token {
546                kind,
547                text: word.to_string(),
548                start,
549                end: i,
550            });
551            ctx = advance_context_after_word(word, ctx);
552            continue;
553        }
554
555        // Operators
556        if is_operator_char(ch) {
557            let start = i;
558            // Two-char operators: <=, >=, !=
559            if i + 1 < len && is_two_char_operator(ch, bytes[i + 1]) {
560                i += 2;
561            } else {
562                i += 1;
563            }
564            tokens.push(Token {
565                kind: TokenKind::Operator,
566                text: input[start..i].to_string(),
567                start,
568                end: i,
569            });
570            continue;
571        }
572
573        // Punctuation
574        if is_punctuation(ch) {
575            let start = i;
576            i += 1;
577            let text = input[start..i].to_string();
578
579            // Track dot for qualified names
580            if ch == b'.' {
581                ctx = GrammarContext::ExpectQualifiedPart;
582            }
583
584            tokens.push(Token {
585                kind: TokenKind::Punctuation,
586                text,
587                start,
588                end: i,
589            });
590            continue;
591        }
592
593        // Unknown character (advance by UTF-8 char length)
594        let start = i;
595        let clen = char_len_at(bytes, i);
596        i += clen;
597        tokens.push(Token {
598            kind: TokenKind::Unknown,
599            text: input[start..i].to_string(),
600            start,
601            end: i,
602        });
603    }
604
605    tokens
606}
607
608/// Get the grammar context at the end of the given input.
609/// Useful for tab completion to know what kind of token is expected next.
610pub fn grammar_context_at_end(input: &str) -> GrammarContext {
611    let tokens = tokenize(input);
612    context_from_tokens(&tokens, input.len())
613}
614
615/// Derive grammar context from a token sequence (skipping whitespace/comments).
616/// `input_len` is the total length of the original input, used to detect trailing whitespace.
617pub fn context_from_tokens(tokens: &[Token], input_len: usize) -> GrammarContext {
618    // Walk backwards through significant tokens to determine context
619    let significant: Vec<&Token> = tokens
620        .iter()
621        .filter(|t| {
622            !matches!(
623                t.kind,
624                TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment
625            )
626        })
627        .collect();
628
629    if significant.is_empty() {
630        return GrammarContext::Start;
631    }
632
633    let last = significant.last().unwrap();
634
635    // Check last keyword for context
636    if last.kind == TokenKind::Keyword || last.kind == TokenKind::Identifier {
637        let upper = last.text.to_uppercase();
638        match upper.as_str() {
639            "SELECT" | "DISTINCT" => return GrammarContext::ExpectColumnList,
640            "FROM" | "UPDATE" | "TABLE" | "TRUNCATE" => return GrammarContext::ExpectTable,
641            "INTO" => {
642                if has_keyword_before(&significant, &["INSERT"]) {
643                    return GrammarContext::ExpectTable;
644                }
645                return GrammarContext::ExpectTable;
646            }
647            "CREATE" => return GrammarContext::ExpectCreateTarget,
648            "ALTER" => return GrammarContext::ExpectAlterTarget,
649            "DROP" => return GrammarContext::ExpectDropTarget,
650            "DELETE" => return GrammarContext::ExpectDeleteTarget,
651            "GRANT" | "REVOKE" => return GrammarContext::ExpectGrantRevoke,
652            "INSERT" => return GrammarContext::ExpectInsertTarget,
653            "BEGIN" => return GrammarContext::ExpectBeginTarget,
654            "USE" | "KEYSPACE" | "KEYSPACES" => return GrammarContext::ExpectKeyspace,
655            "WHERE" | "IF" => return GrammarContext::ExpectColumn,
656            "AND" => {
657                // AND after WHERE/IF is column context
658                if has_keyword_before(&significant, &["WHERE", "IF"]) {
659                    return GrammarContext::ExpectColumn;
660                }
661                return GrammarContext::General;
662            }
663            "SET" => {
664                // SET after UPDATE is column assignment; SET as a type is different
665                if has_keyword_before(&significant, &["UPDATE"]) {
666                    return GrammarContext::ExpectSetClause;
667                }
668                return GrammarContext::General;
669            }
670            "ORDER" => return GrammarContext::ExpectOrderBy,
671            "BY" => {
672                if significant.len() >= 2
673                    && significant[significant.len() - 2].text.to_uppercase() == "ORDER"
674                {
675                    return GrammarContext::ExpectOrderByColumn;
676                }
677                return GrammarContext::General;
678            }
679            "VALUES" => return GrammarContext::ExpectValues,
680            "WITH" => return GrammarContext::ExpectWithOption,
681            "CONSISTENCY" => return GrammarContext::ExpectConsistencyLevel,
682            "DESCRIBE" | "DESC" => return GrammarContext::ExpectDescribeTarget,
683            "SOURCE" | "CAPTURE" => return GrammarContext::ExpectFilePath,
684            "ON" => {
685                // INDEX ... ON -> expect table
686                if has_keyword_before(&significant, &["INDEX"]) {
687                    return GrammarContext::ExpectTable;
688                }
689                return GrammarContext::General;
690            }
691            "INDEX" => {
692                // After CREATE/DROP INDEX -> expect index name (identifier)
693                return GrammarContext::General;
694            }
695            _ => {}
696        }
697    }
698
699    if last.kind == TokenKind::Punctuation && last.text == "." {
700        return GrammarContext::ExpectQualifiedPart;
701    }
702
703    // After SELECT * → still in column list (expecting FROM or more columns)
704    if last.kind == TokenKind::Punctuation
705        && last.text == "*"
706        && significant.len() >= 2
707        && significant[significant.len() - 2].text.to_uppercase() == "SELECT"
708    {
709        return GrammarContext::ExpectColumnList;
710    }
711
712    // If last token is an identifier and second-to-last is a dot, we're still
713    // in qualified-name context — but only if the last token touches end of input
714    // (no trailing whitespace). "system.c" → qualified part; "system.table " → post-table.
715    if significant.len() >= 2 {
716        let last_token_end = last.start + last.text.len();
717        let at_end_of_input = last_token_end >= input_len;
718        let second_last = significant[significant.len() - 2];
719        if at_end_of_input && second_last.kind == TokenKind::Punctuation && second_last.text == "."
720        {
721            return GrammarContext::ExpectQualifiedPart;
722        }
723    }
724
725    // After FROM/INTO/UPDATE/TABLE/TRUNCATE + table_name — determine post-table context
726    // Handles both unqualified (KEYWORD table) and qualified (KEYWORD ks.table) names.
727    if significant.len() >= 2 {
728        let keyword_before_table = find_keyword_before_table(&significant);
729        if let Some(kw_upper) = keyword_before_table {
730            match kw_upper.as_str() {
731                "FROM" => {
732                    if has_keyword_before(&significant, &["SELECT"]) {
733                        return GrammarContext::ExpectSelectPostFrom;
734                    }
735                    if has_keyword_before(&significant, &["DELETE"]) {
736                        return GrammarContext::ExpectDeletePostFrom;
737                    }
738                    return GrammarContext::General;
739                }
740                "INTO" => {
741                    return GrammarContext::General;
742                }
743                "UPDATE" => {
744                    return GrammarContext::ExpectUpdateClause;
745                }
746                "TABLE" | "TRUNCATE" => {
747                    return GrammarContext::General;
748                }
749                _ => {}
750            }
751        }
752        // After SERIAL -> if next is CONSISTENCY
753        let second_last = significant[significant.len() - 2];
754        let sl_upper = second_last.text.to_uppercase();
755        if sl_upper == "SERIAL" && last.text.to_uppercase() == "CONSISTENCY" {
756            return GrammarContext::ExpectConsistencyLevel;
757        }
758    }
759
760    // Statement-aware fallthrough: detect post-clause contexts
761    if has_keyword_before(&significant, &["UPDATE"]) && has_keyword_before(&significant, &["SET"]) {
762        return GrammarContext::ExpectUpdatePostSet;
763    }
764    if has_keyword_before(&significant, &["INSERT"])
765        && has_keyword_before(&significant, &["VALUES"])
766    {
767        return GrammarContext::ExpectInsertPostValues;
768    }
769    if has_keyword_before(&significant, &["SELECT"]) && has_keyword_before(&significant, &["FROM"])
770    {
771        return GrammarContext::ExpectSelectPostFrom;
772    }
773    if has_keyword_before(&significant, &["DELETE"]) && has_keyword_before(&significant, &["FROM"])
774    {
775        return GrammarContext::ExpectDeletePostFrom;
776    }
777
778    GrammarContext::General
779}
780
781/// Contexts where the next word is always an identifier (a name), regardless
782/// of whether it matches a keyword. E.g., after FROM the next word is a table name.
783fn is_strict_identifier_context(ctx: GrammarContext) -> bool {
784    matches!(
785        ctx,
786        GrammarContext::ExpectTable
787            | GrammarContext::ExpectKeyspace
788            | GrammarContext::ExpectColumn
789            | GrammarContext::ExpectQualifiedPart
790            | GrammarContext::ExpectOrderByColumn
791            | GrammarContext::ExpectSetClause
792            | GrammarContext::ExpectConsistencyLevel
793    )
794}
795
/// Words that keep their keyword status inside a SELECT column list: the
/// clause-level keywords that end or modify the list.
const COLUMN_LIST_KEYWORDS: &[&str] = &["AS", "DISTINCT", "FROM", "JSON"];

/// Check whether `word`, compared case-insensitively, is one of
/// `COLUMN_LIST_KEYWORDS`.
fn is_column_list_keyword(word: &str) -> bool {
    let candidate = word.to_uppercase();
    COLUMN_LIST_KEYWORDS
        .iter()
        .any(|keyword| *keyword == candidate)
}
804
805/// Classify a word as keyword, boolean, or identifier based on grammar context.
806fn classify_word(word: &str, ctx: GrammarContext) -> TokenKind {
807    let upper = word.to_uppercase();
808
809    // Boolean literals
810    if upper == "TRUE" || upper == "FALSE" {
811        return TokenKind::BooleanLiteral;
812    }
813
814    // In strict identifier contexts, EVERYTHING is an identifier
815    if is_strict_identifier_context(ctx) {
816        return TokenKind::Identifier;
817    }
818
819    // In column list context, only specific keywords remain keywords
820    if ctx == GrammarContext::ExpectColumnList {
821        if is_column_list_keyword(word) {
822            return TokenKind::Keyword;
823        }
824        return TokenKind::Identifier;
825    }
826
827    // NULL is a keyword-like value
828    if upper == "NULL" {
829        return TokenKind::Keyword;
830    }
831
832    if is_cql_keyword(&upper) {
833        TokenKind::Keyword
834    } else {
835        TokenKind::Identifier
836    }
837}
838
839/// Advance the grammar context after seeing a word token.
840fn advance_context_after_word(word: &str, ctx: GrammarContext) -> GrammarContext {
841    let upper = word.to_uppercase();
842
843    match upper.as_str() {
844        "SELECT" => GrammarContext::ExpectColumnList,
845        "DISTINCT" if ctx == GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
846        "FROM" => GrammarContext::ExpectTable,
847        "INTO" => GrammarContext::ExpectTable,
848        "UPDATE" => GrammarContext::ExpectTable,
849        "TABLE" => GrammarContext::ExpectTable,
850        "TRUNCATE" => GrammarContext::ExpectTable,
851        "USE" => GrammarContext::ExpectKeyspace,
852        "KEYSPACE" => GrammarContext::ExpectKeyspace,
853        "WHERE" => GrammarContext::ExpectColumn,
854        "AND" => {
855            // Preserve column context if we're in a WHERE/SET clause
856            match ctx {
857                GrammarContext::ExpectColumn | GrammarContext::General => {
858                    // After WHERE col = val AND -> next column
859                    GrammarContext::ExpectColumn
860                }
861                _ => GrammarContext::General,
862            }
863        }
864        "SET" => {
865            // After UPDATE table SET -> column assignment
866            match ctx {
867                GrammarContext::General => GrammarContext::ExpectSetClause,
868                _ => GrammarContext::General,
869            }
870        }
871        "ORDER" => GrammarContext::ExpectOrderBy,
872        "BY" => {
873            if ctx == GrammarContext::ExpectOrderBy {
874                GrammarContext::ExpectOrderByColumn
875            } else {
876                GrammarContext::General
877            }
878        }
879        "VALUES" => GrammarContext::ExpectValues,
880        "WITH" => GrammarContext::ExpectWithOption,
881        "ON" => {
882            // Could be INDEX ... ON table
883            GrammarContext::ExpectTable
884        }
885        "CONSISTENCY" => GrammarContext::ExpectConsistencyLevel,
886        "DESCRIBE" | "DESC" => GrammarContext::ExpectDescribeTarget,
887        "SOURCE" | "CAPTURE" => GrammarContext::ExpectFilePath,
888        "INSERT" => GrammarContext::General, // INSERT INTO -> INTO will set ExpectTable
889        "DELETE" => GrammarContext::General, // DELETE FROM -> FROM will set ExpectTable
890        "CREATE" | "ALTER" | "DROP" => GrammarContext::General,
891        "IF" => GrammarContext::ExpectColumn,
892        "LIMIT" => GrammarContext::General,
893        _ => {
894            // After an identifier in table/keyspace/column position, go to General
895            match ctx {
896                GrammarContext::ExpectTable
897                | GrammarContext::ExpectKeyspace
898                | GrammarContext::ExpectColumn
899                | GrammarContext::ExpectOrderByColumn
900                | GrammarContext::ExpectQualifiedPart
901                | GrammarContext::ExpectDescribeTarget => GrammarContext::General,
902                GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList, // stay in column list
903                GrammarContext::ExpectSetClause => GrammarContext::ExpectSetClause,
904                other => other,
905            }
906        }
907    }
908}
909
910/// Advance context after a value (string literal, number, etc.).
911fn advance_context_after_value(ctx: GrammarContext) -> GrammarContext {
912    match ctx {
913        GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
914        _ => GrammarContext::General,
915    }
916}
917
918/// Advance context after a name (quoted identifier, etc.).
919fn advance_context_after_name(ctx: GrammarContext) -> GrammarContext {
920    match ctx {
921        GrammarContext::ExpectTable
922        | GrammarContext::ExpectKeyspace
923        | GrammarContext::ExpectColumn
924        | GrammarContext::ExpectQualifiedPart
925        | GrammarContext::ExpectOrderByColumn
926        | GrammarContext::ExpectDescribeTarget => GrammarContext::General,
927        GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
928        other => other,
929    }
930}
931
932/// Check if any of the given keywords appear earlier in the significant token list.
933/// Find the keyword immediately before a table reference at the end of the token stream.
934/// Handles unqualified (`FROM table`) and qualified (`FROM ks.table`) patterns.
935fn find_keyword_before_table(significant: &[&Token]) -> Option<String> {
936    const TABLE_KEYWORDS: &[&str] = &["FROM", "INTO", "UPDATE", "TABLE", "TRUNCATE"];
937    let len = significant.len();
938    if len < 2 {
939        return None;
940    }
941
942    // Unqualified: [.., KEYWORD, table] — check len-2
943    let second_last_upper = significant[len - 2].text.to_uppercase();
944    if TABLE_KEYWORDS.contains(&second_last_upper.as_str()) {
945        return Some(second_last_upper);
946    }
947
948    // Qualified: [.., KEYWORD, ks, ".", table] — check len-4
949    if len >= 4 && significant[len - 2].text == "." {
950        let kw_upper = significant[len - 4].text.to_uppercase();
951        if TABLE_KEYWORDS.contains(&kw_upper.as_str()) {
952            return Some(kw_upper);
953        }
954    }
955
956    None
957}
958
959fn has_keyword_before(significant: &[&Token], keywords: &[&str]) -> bool {
960    significant.iter().rev().skip(1).any(|t| {
961        let upper = t.text.to_uppercase();
962        keywords.contains(&upper.as_str())
963    })
964}
965
966/// Determine if a '-' is in a position where it could be a negative number sign
967/// (after operator, punctuation, or at start).
968fn is_number_sign_position(tokens: &[Token]) -> bool {
969    match tokens.last() {
970        None => true,
971        Some(t) => matches!(
972            t.kind,
973            TokenKind::Operator
974                | TokenKind::Punctuation
975                | TokenKind::Keyword
976                | TokenKind::Whitespace
977        ),
978    }
979}
980
/// Check if the text from `start` to `num_end` looks like the first group of
/// a UUID, i.e. exactly 8 ASCII hex digits.
///
/// Only the `start..num_end` slice is inspected; checking that it is followed
/// by `-` is left to the caller. An out-of-range, inverted, or
/// non-char-boundary range yields `false` instead of panicking.
fn looks_like_uuid(input: &str, start: usize, num_end: usize) -> bool {
    match input.get(start..num_end) {
        Some(segment) => segment.len() == 8 && segment.bytes().all(|b| b.is_ascii_hexdigit()),
        None => false,
    }
}
987
/// Try to match a UUID (hex groups of 8-4-4-4-12 separated by `-`) beginning
/// at byte offset `start`.
///
/// Returns the byte offset just past the UUID on success. Returns `start`
/// unchanged when the text does not match, or when the match is immediately
/// followed by an ASCII alphanumeric or `_` (it would then be part of a
/// longer token).
fn scan_uuid(input: &str, start: usize) -> usize {
    const GROUP_LENGTHS: [usize; 5] = [8, 4, 4, 4, 12];
    let raw = input.as_bytes();
    let mut cursor = start;

    for (index, &wanted) in GROUP_LENGTHS.iter().enumerate() {
        // Every group after the first must be introduced by a dash.
        if index > 0 {
            match raw.get(cursor) {
                Some(b'-') => cursor += 1,
                _ => return start,
            }
        }

        // Take the whole run of hex digits; a run that is too short or too
        // long disqualifies the candidate.
        let run = raw.get(cursor..).map_or(0, |rest| {
            rest.iter().take_while(|b| b.is_ascii_hexdigit()).count()
        });
        if run != wanted {
            return start;
        }
        cursor += run;
    }

    match raw.get(cursor) {
        Some(&following) if following.is_ascii_alphanumeric() || following == b'_' => start,
        _ => cursor,
    }
}
1019
/// Whether `ch` is one of the single-byte operator characters
/// `=`, `<`, `>`, `!`, `+`, `%`.
fn is_operator_char(ch: u8) -> bool {
    b"=<>!+%".contains(&ch)
}
1023
/// Whether the byte pair forms one of the two-character operators
/// `<=`, `>=`, `!=`.
fn is_two_char_operator(first: u8, second: u8) -> bool {
    second == b'=' && matches!(first, b'<' | b'>' | b'!')
}
1027
/// Whether `ch` is a punctuation byte: one of
/// `;` `,` `(` `)` `.` `*` `?` `{` `}` `[` `]` `:`.
fn is_punctuation(ch: u8) -> bool {
    b";,().*?{}[]:".contains(&ch)
}
1034
/// Byte length of the UTF-8 encoded character whose first byte is `bytes[i]`.
///
/// The length is derived from that single byte only: below `0x80` → 1,
/// below `0xE0` → 2, below `0xF0` → 3, anything else → 4. An index past the
/// end of `bytes` yields 1. Note that continuation bytes (`0x80..=0xBF`) fall
/// into the 2-byte range, so `i` is presumably meant to point at the start of
/// a character.
fn char_len_at(bytes: &[u8], i: usize) -> usize {
    match bytes.get(i).copied() {
        None | Some(0x00..=0x7F) => 1,
        Some(0x80..=0xDF) => 2,
        Some(0xE0..=0xEF) => 3,
        Some(_) => 4,
    }
}
1051
1052/// Extract only the significant (non-whitespace, non-comment) tokens.
1053pub fn significant_tokens(tokens: &[Token]) -> Vec<&Token> {
1054    tokens
1055        .iter()
1056        .filter(|t| {
1057            !matches!(
1058                t.kind,
1059                TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment
1060            )
1061        })
1062        .collect()
1063}
1064
/// Strip comments from CQL input, replacing block comments with a space
/// and removing line comments (preserving newlines).
///
/// Recognised comment forms are `--` and `//` line comments (up to, but not
/// including, the next `\n`) and `/* ... */` block comments, which may nest.
/// Single-quoted strings (`''` escapes a quote), double-quoted identifiers
/// (`""` escapes a quote) and `$$...$$` literals are copied verbatim, so
/// comment markers inside them are kept. Unterminated strings, literals and
/// block comments extend to the end of the input.
///
/// This is a single-pass byte scanner with no `Vec<Token>` intermediary; the
/// only allocation is the returned `String`. Used by the parser on every
/// `feed_line` call, so performance matters.
pub fn strip_comments(input: &str) -> String {
    /// Index of the `\n` that ends a line comment whose text starts at `from`,
    /// or `bytes.len()` when the comment runs to the end of the input.
    fn line_comment_end(bytes: &[u8], from: usize) -> usize {
        bytes[from..]
            .iter()
            .position(|&b| b == b'\n')
            .map_or(bytes.len(), |offset| from + offset)
    }

    /// Index just past a block comment whose text (after the opening `/*`)
    /// starts at `from`; nested `/* */` pairs are balanced.
    fn block_comment_end(bytes: &[u8], from: usize) -> usize {
        let len = bytes.len();
        let mut depth: usize = 1;
        let mut i = from;
        while i < len && depth > 0 {
            match (bytes[i], bytes.get(i + 1).copied()) {
                (b'/', Some(b'*')) => {
                    depth += 1;
                    i += 2;
                }
                (b'*', Some(b'/')) => {
                    depth -= 1;
                    i += 2;
                }
                _ => i += 1,
            }
        }
        i
    }

    /// Index just past the quoted run opening at `open`; a doubled `quote`
    /// inside the run is an escape and does not terminate it.
    fn quoted_end(bytes: &[u8], open: usize, quote: u8) -> usize {
        let len = bytes.len();
        let mut i = open + 1;
        while i < len {
            let at_quote = bytes[i] == quote;
            i += 1;
            if at_quote {
                if i < len && bytes[i] == quote {
                    i += 1; // escaped quote, keep scanning
                } else {
                    break; // closing quote consumed
                }
            }
        }
        i
    }

    /// Index just past the `$$...$$` literal opening at `open`, or
    /// `bytes.len()` when it is unterminated.
    fn dollar_quoted_end(bytes: &[u8], open: usize) -> usize {
        let body = open + 2;
        bytes[body..]
            .windows(2)
            .position(|pair| pair == b"$$")
            .map_or(bytes.len(), |offset| body + offset + 2)
    }

    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut result = String::with_capacity(len);
    // Start of the pending run of text that is kept but not yet copied.
    // Runs are only cut where a comment starts or ends; those positions sit
    // next to ASCII bytes, so they are always valid char boundaries.
    let mut keep_from = 0;
    let mut i = 0;

    while i < len {
        let next = bytes.get(i + 1).copied();
        match (bytes[i], next) {
            // Line comment: `--` or `//` (Python cqlsh compatibility).
            // The newline itself is not part of the comment and stays in the output.
            (b'-', Some(b'-')) | (b'/', Some(b'/')) => {
                result.push_str(&input[keep_from..i]);
                i = line_comment_end(bytes, i + 2);
                keep_from = i;
            }
            // Block comment: replaced by one space to avoid token merging.
            (b'/', Some(b'*')) => {
                result.push_str(&input[keep_from..i]);
                i = block_comment_end(bytes, i + 2);
                result.push(' ');
                keep_from = i;
            }
            // Quoted string / identifier: stays inside the pending run.
            (quote @ b'\'', _) | (quote @ b'"', _) => i = quoted_end(bytes, i, quote),
            // Dollar-quoted literal: stays inside the pending run.
            (b'$', Some(b'$')) => i = dollar_quoted_end(bytes, i),
            // Ordinary byte (including bytes of multi-byte characters).
            _ => i += 1,
        }
    }

    result.push_str(&input[keep_from..]);
    result
}
1188
1189#[cfg(test)]
1190mod tests {
1191    use super::*;
1192
1193    // ===== Helper =====
1194
1195    #[allow(dead_code)]
1196    fn token_kinds(input: &str) -> Vec<TokenKind> {
1197        tokenize(input).into_iter().map(|t| t.kind).collect()
1198    }
1199
1200    fn significant_kinds(input: &str) -> Vec<TokenKind> {
1201        tokenize(input)
1202            .into_iter()
1203            .filter(|t| t.kind != TokenKind::Whitespace)
1204            .map(|t| t.kind)
1205            .collect()
1206    }
1207
1208    fn significant_texts(input: &str) -> Vec<String> {
1209        tokenize(input)
1210            .into_iter()
1211            .filter(|t| t.kind != TokenKind::Whitespace)
1212            .map(|t| t.text)
1213            .collect()
1214    }
1215
1216    // ===== Token Kind Tests =====
1217
1218    #[test]
1219    fn keyword_select() {
1220        let tokens = tokenize("SELECT");
1221        assert_eq!(tokens.len(), 1);
1222        assert_eq!(tokens[0].kind, TokenKind::Keyword);
1223        assert_eq!(tokens[0].text, "SELECT");
1224    }
1225
1226    #[test]
1227    fn keyword_case_insensitive() {
1228        let tokens = tokenize("select");
1229        assert_eq!(tokens[0].kind, TokenKind::Keyword);
1230        assert_eq!(tokens[0].text, "select");
1231    }
1232
1233    #[test]
1234    fn keyword_mixed_case() {
1235        let tokens = tokenize("Select");
1236        assert_eq!(tokens[0].kind, TokenKind::Keyword);
1237    }
1238
1239    #[test]
1240    fn identifier_plain() {
1241        // After FROM, words are identifiers even if they match keywords
1242        let tokens = tokenize("FROM users");
1243        let sig: Vec<_> = tokens
1244            .iter()
1245            .filter(|t| t.kind != TokenKind::Whitespace)
1246            .collect();
1247        assert_eq!(sig[0].kind, TokenKind::Keyword);
1248        assert_eq!(sig[1].kind, TokenKind::Identifier);
1249        assert_eq!(sig[1].text, "users");
1250    }
1251
1252    #[test]
1253    fn identifier_after_from_keyword_name() {
1254        // USERS after FROM should be identifier, not keyword
1255        let tokens = tokenize("SELECT * FROM USERS");
1256        let sig: Vec<_> = tokens
1257            .iter()
1258            .filter(|t| t.kind != TokenKind::Whitespace)
1259            .collect();
1260        assert_eq!(sig[3].text, "USERS");
1261        assert_eq!(sig[3].kind, TokenKind::Identifier);
1262    }
1263
1264    #[test]
1265    fn identifier_key_after_from() {
1266        let tokens = tokenize("SELECT * FROM KEY");
1267        let sig: Vec<_> = tokens
1268            .iter()
1269            .filter(|t| t.kind != TokenKind::Whitespace)
1270            .collect();
1271        assert_eq!(sig[3].text, "KEY");
1272        assert_eq!(sig[3].kind, TokenKind::Identifier);
1273    }
1274
1275    #[test]
1276    fn identifier_set_after_from() {
1277        let tokens = tokenize("SELECT * FROM SET");
1278        let sig: Vec<_> = tokens
1279            .iter()
1280            .filter(|t| t.kind != TokenKind::Whitespace)
1281            .collect();
1282        assert_eq!(sig[3].text, "SET");
1283        assert_eq!(sig[3].kind, TokenKind::Identifier);
1284    }
1285
1286    #[test]
1287    fn identifier_after_into() {
1288        let tokens = tokenize("INSERT INTO my_table");
1289        let sig: Vec<_> = tokens
1290            .iter()
1291            .filter(|t| t.kind != TokenKind::Whitespace)
1292            .collect();
1293        assert_eq!(sig[2].kind, TokenKind::Identifier);
1294    }
1295
1296    #[test]
1297    fn identifier_after_update() {
1298        let tokens = tokenize("UPDATE my_table SET");
1299        let sig: Vec<_> = tokens
1300            .iter()
1301            .filter(|t| t.kind != TokenKind::Whitespace)
1302            .collect();
1303        assert_eq!(sig[1].kind, TokenKind::Identifier);
1304        assert_eq!(sig[1].text, "my_table");
1305    }
1306
1307    #[test]
1308    fn identifier_after_dot() {
1309        let tokens = tokenize("ks.my_table");
1310        let sig: Vec<_> = tokens
1311            .iter()
1312            .filter(|t| t.kind != TokenKind::Whitespace)
1313            .collect();
1314        assert_eq!(sig[0].kind, TokenKind::Identifier); // ks at start is identifier? No, at Start it's not a keyword
1315                                                        // Actually "ks" is not a keyword, so it's Identifier
1316        assert_eq!(sig[1].kind, TokenKind::Punctuation); // .
1317        assert_eq!(sig[2].kind, TokenKind::Identifier); // my_table
1318    }
1319
1320    #[test]
1321    fn keyword_after_dot_is_identifier() {
1322        // SELECT after a dot should be identifier (qualified name)
1323        let tokens = tokenize("FROM ks.SELECT");
1324        let sig: Vec<_> = tokens
1325            .iter()
1326            .filter(|t| t.kind != TokenKind::Whitespace)
1327            .collect();
1328        assert_eq!(sig[2].kind, TokenKind::Punctuation); // .
1329        assert_eq!(sig[3].kind, TokenKind::Identifier); // SELECT as identifier
1330    }
1331
1332    #[test]
1333    fn quoted_identifier() {
1334        let tokens = tokenize("\"MyTable\"");
1335        assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1336        assert_eq!(tokens[0].text, "\"MyTable\"");
1337    }
1338
1339    #[test]
1340    fn quoted_identifier_with_escape() {
1341        let tokens = tokenize("\"My\"\"Table\"");
1342        assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1343        assert_eq!(tokens[0].text, "\"My\"\"Table\"");
1344    }
1345
1346    #[test]
1347    fn string_literal_simple() {
1348        let tokens = tokenize("'hello'");
1349        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1350        assert_eq!(tokens[0].text, "'hello'");
1351    }
1352
1353    #[test]
1354    fn string_literal_escaped_quote() {
1355        let tokens = tokenize("'it''s'");
1356        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1357        assert_eq!(tokens[0].text, "'it''s'");
1358    }
1359
1360    #[test]
1361    fn string_literal_with_semicolon() {
1362        let tokens = tokenize("'hello;world'");
1363        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1364        assert_eq!(tokens[0].text, "'hello;world'");
1365    }
1366
1367    #[test]
1368    fn string_literal_empty() {
1369        let tokens = tokenize("''");
1370        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1371        assert_eq!(tokens[0].text, "''");
1372    }
1373
1374    #[test]
1375    fn dollar_string_literal() {
1376        let tokens = tokenize("$$hello world$$");
1377        assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1378        assert_eq!(tokens[0].text, "$$hello world$$");
1379    }
1380
1381    #[test]
1382    fn dollar_string_with_semicolon() {
1383        let tokens = tokenize("$$a;b$$");
1384        assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1385    }
1386
1387    #[test]
1388    fn dollar_string_empty() {
1389        let tokens = tokenize("$$$$");
1390        assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1391        assert_eq!(tokens[0].text, "$$$$");
1392    }
1393
1394    #[test]
1395    fn number_integer() {
1396        let tokens = tokenize("42");
1397        assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1398        assert_eq!(tokens[0].text, "42");
1399    }
1400
1401    #[test]
1402    fn number_decimal() {
1403        let tokens = tokenize("3.14");
1404        assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1405        assert_eq!(tokens[0].text, "3.14");
1406    }
1407
1408    #[test]
1409    fn number_negative() {
1410        let tokens = tokenize("= -1");
1411        let sig: Vec<_> = tokens
1412            .iter()
1413            .filter(|t| t.kind != TokenKind::Whitespace)
1414            .collect();
1415        assert_eq!(sig[1].kind, TokenKind::NumberLiteral);
1416        assert_eq!(sig[1].text, "-1");
1417    }
1418
1419    #[test]
1420    fn number_exponent() {
1421        let tokens = tokenize("1.5E10");
1422        assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1423        assert_eq!(tokens[0].text, "1.5E10");
1424    }
1425
1426    #[test]
1427    fn number_not_part_of_identifier() {
1428        let tokens = tokenize("LIMIT 100");
1429        let sig: Vec<_> = tokens
1430            .iter()
1431            .filter(|t| t.kind != TokenKind::Whitespace)
1432            .collect();
1433        assert_eq!(sig[1].kind, TokenKind::NumberLiteral);
1434    }
1435
1436    #[test]
1437    fn blob_literal() {
1438        let tokens = tokenize("0xDEADBEEF");
1439        assert_eq!(tokens[0].kind, TokenKind::BlobLiteral);
1440        assert_eq!(tokens[0].text, "0xDEADBEEF");
1441    }
1442
1443    #[test]
1444    fn blob_literal_lowercase() {
1445        let tokens = tokenize("0xdeadbeef");
1446        assert_eq!(tokens[0].kind, TokenKind::BlobLiteral);
1447    }
1448
1449    #[test]
1450    fn uuid_literal() {
1451        let tokens = tokenize("550e8400-e29b-41d4-a716-446655440000");
1452        assert_eq!(tokens[0].kind, TokenKind::UuidLiteral);
1453    }
1454
1455    #[test]
1456    fn boolean_true() {
1457        let tokens = tokenize("true");
1458        assert_eq!(tokens[0].kind, TokenKind::BooleanLiteral);
1459    }
1460
1461    #[test]
1462    fn boolean_false() {
1463        let tokens = tokenize("FALSE");
1464        assert_eq!(tokens[0].kind, TokenKind::BooleanLiteral);
1465    }
1466
1467    #[test]
1468    fn operator_equals() {
1469        let tokens = tokenize("=");
1470        assert_eq!(tokens[0].kind, TokenKind::Operator);
1471    }
1472
1473    #[test]
1474    fn operator_less_equal() {
1475        let tokens = tokenize("<=");
1476        assert_eq!(tokens[0].kind, TokenKind::Operator);
1477        assert_eq!(tokens[0].text, "<=");
1478    }
1479
1480    #[test]
1481    fn operator_greater_equal() {
1482        let tokens = tokenize(">=");
1483        assert_eq!(tokens[0].kind, TokenKind::Operator);
1484    }
1485
1486    #[test]
1487    fn operator_not_equal() {
1488        let tokens = tokenize("!=");
1489        assert_eq!(tokens[0].kind, TokenKind::Operator);
1490    }
1491
1492    #[test]
1493    fn punctuation_semicolon() {
1494        let tokens = tokenize(";");
1495        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1496    }
1497
1498    #[test]
1499    fn punctuation_comma() {
1500        let tokens = tokenize(",");
1501        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1502    }
1503
1504    #[test]
1505    fn punctuation_parens() {
1506        let kinds = significant_kinds("(x)");
1507        assert_eq!(
1508            kinds,
1509            vec![
1510                TokenKind::Punctuation,
1511                TokenKind::Identifier,
1512                TokenKind::Punctuation
1513            ]
1514        );
1515    }
1516
1517    #[test]
1518    fn punctuation_star() {
1519        let tokens = tokenize("*");
1520        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1521    }
1522
1523    #[test]
1524    fn punctuation_question_mark() {
1525        let tokens = tokenize("?");
1526        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1527    }
1528
1529    #[test]
1530    fn whitespace_space() {
1531        let tokens = tokenize("  ");
1532        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
1533    }
1534
1535    #[test]
1536    fn whitespace_tab() {
1537        let tokens = tokenize("\t");
1538        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
1539    }
1540
1541    #[test]
1542    fn whitespace_newline() {
1543        let tokens = tokenize("\n");
1544        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
1545    }
1546
1547    #[test]
1548    fn line_comment() {
1549        let tokens = tokenize("-- this is a comment");
1550        assert_eq!(tokens[0].kind, TokenKind::LineComment);
1551        assert_eq!(tokens[0].text, "-- this is a comment");
1552    }
1553
1554    #[test]
1555    fn line_comment_stops_at_newline() {
1556        let tokens = tokenize("-- comment\nSELECT");
1557        assert_eq!(tokens[0].kind, TokenKind::LineComment);
1558        assert_eq!(tokens[0].text, "-- comment");
1559        // newline + SELECT follow
1560    }
1561
1562    #[test]
1563    fn block_comment() {
1564        let tokens = tokenize("/* block */");
1565        assert_eq!(tokens[0].kind, TokenKind::BlockComment);
1566        assert_eq!(tokens[0].text, "/* block */");
1567    }
1568
1569    #[test]
1570    fn block_comment_nested() {
1571        let tokens = tokenize("/* outer /* inner */ still */");
1572        assert_eq!(tokens[0].kind, TokenKind::BlockComment);
1573        assert_eq!(tokens[0].text, "/* outer /* inner */ still */");
1574    }
1575
1576    #[test]
1577    fn block_comment_with_semicolon() {
1578        let tokens = tokenize("/* ; */");
1579        assert_eq!(tokens[0].kind, TokenKind::BlockComment);
1580    }
1581
1582    #[test]
1583    fn unknown_char() {
1584        let tokens = tokenize("@");
1585        assert_eq!(tokens[0].kind, TokenKind::Unknown);
1586    }
1587
1588    // ===== Span Tests =====
1589
1590    #[test]
1591    fn spans_are_correct() {
1592        let tokens = tokenize("SELECT *");
1593        assert_eq!(tokens[0].start, 0);
1594        assert_eq!(tokens[0].end, 6);
1595        assert_eq!(tokens[1].start, 6);
1596        assert_eq!(tokens[1].end, 7);
1597        assert_eq!(tokens[2].start, 7);
1598        assert_eq!(tokens[2].end, 8);
1599    }
1600
1601    #[test]
1602    fn spans_cover_full_input() {
1603        let input = "SELECT * FROM users WHERE id = 1;";
1604        let tokens = tokenize(input);
1605        let last = tokens.last().unwrap();
1606        assert_eq!(last.end, input.len());
1607        // Verify no gaps
1608        for window in tokens.windows(2) {
1609            assert_eq!(
1610                window[0].end, window[1].start,
1611                "gap between {:?} and {:?}",
1612                window[0], window[1]
1613            );
1614        }
1615    }
1616
1617    // ===== Full Statement Tests =====
1618
1619    #[test]
1620    fn select_star_from_users() {
1621        let kinds = significant_kinds("SELECT * FROM users");
1622        assert_eq!(
1623            kinds,
1624            vec![
1625                TokenKind::Keyword,
1626                TokenKind::Punctuation,
1627                TokenKind::Keyword,
1628                TokenKind::Identifier
1629            ]
1630        );
1631    }
1632
1633    #[test]
1634    fn select_with_where() {
1635        let kinds = significant_kinds("SELECT name FROM users WHERE id = 1");
1636        assert_eq!(
1637            kinds,
1638            vec![
1639                TokenKind::Keyword,       // SELECT
1640                TokenKind::Identifier,    // name (in column list)
1641                TokenKind::Keyword,       // FROM
1642                TokenKind::Identifier,    // users
1643                TokenKind::Keyword,       // WHERE
1644                TokenKind::Identifier,    // id
1645                TokenKind::Operator,      // =
1646                TokenKind::NumberLiteral, // 1
1647            ]
1648        );
1649    }
1650
1651    #[test]
1652    fn insert_statement() {
1653        let kinds = significant_kinds("INSERT INTO my_table (id, name) VALUES (1, 'hello')");
1654        assert_eq!(
1655            kinds,
1656            vec![
1657                TokenKind::Keyword,       // INSERT
1658                TokenKind::Keyword,       // INTO
1659                TokenKind::Identifier,    // my_table
1660                TokenKind::Punctuation,   // (
1661                TokenKind::Identifier,    // id
1662                TokenKind::Punctuation,   // ,
1663                TokenKind::Identifier,    // name
1664                TokenKind::Punctuation,   // )
1665                TokenKind::Keyword,       // VALUES
1666                TokenKind::Punctuation,   // (
1667                TokenKind::NumberLiteral, // 1
1668                TokenKind::Punctuation,   // ,
1669                TokenKind::StringLiteral, // 'hello'
1670                TokenKind::Punctuation,   // )
1671            ]
1672        );
1673    }
1674
1675    #[test]
1676    fn update_statement() {
1677        let kinds = significant_kinds("UPDATE users SET name = 'Alice' WHERE id = 1");
1678        assert_eq!(
1679            kinds,
1680            vec![
1681                TokenKind::Keyword,       // UPDATE
1682                TokenKind::Identifier,    // users
1683                TokenKind::Keyword,       // SET
1684                TokenKind::Identifier,    // name
1685                TokenKind::Operator,      // =
1686                TokenKind::StringLiteral, // 'Alice'
1687                TokenKind::Keyword,       // WHERE
1688                TokenKind::Identifier,    // id
1689                TokenKind::Operator,      // =
1690                TokenKind::NumberLiteral, // 1
1691            ]
1692        );
1693    }
1694
1695    #[test]
1696    fn create_table() {
1697        let kinds = significant_kinds("CREATE TABLE ks.my_table (id int PRIMARY KEY)");
1698        assert_eq!(
1699            kinds,
1700            vec![
1701                TokenKind::Keyword,     // CREATE
1702                TokenKind::Keyword,     // TABLE
1703                TokenKind::Identifier,  // ks
1704                TokenKind::Punctuation, // .
1705                TokenKind::Identifier,  // my_table
1706                TokenKind::Punctuation, // (
1707                TokenKind::Identifier,  // id
1708                TokenKind::Identifier,  // int (type name, in general context after column name)
1709                TokenKind::Keyword,     // PRIMARY
1710                TokenKind::Keyword,     // KEY
1711                TokenKind::Punctuation, // )
1712            ]
1713        );
1714    }
1715
1716    #[test]
1717    fn use_keyspace() {
1718        let kinds = significant_kinds("USE my_keyspace");
1719        assert_eq!(kinds, vec![TokenKind::Keyword, TokenKind::Identifier]);
1720    }
1721
1722    #[test]
1723    fn qualified_table_name() {
1724        let texts = significant_texts("SELECT * FROM ks.users");
1725        assert_eq!(texts, vec!["SELECT", "*", "FROM", "ks", ".", "users"]);
1726        let kinds = significant_kinds("SELECT * FROM ks.users");
1727        assert_eq!(kinds[3], TokenKind::Identifier); // ks
1728        assert_eq!(kinds[4], TokenKind::Punctuation); // .
1729        assert_eq!(kinds[5], TokenKind::Identifier); // users
1730    }
1731
1732    #[test]
1733    fn statement_with_string_containing_keyword() {
1734        let kinds = significant_kinds("INSERT INTO t (v) VALUES ('SELECT FROM')");
1735        // 'SELECT FROM' should be one StringLiteral, not keywords
1736        assert!(kinds.contains(&TokenKind::StringLiteral));
1737        // Only 3 keywords: INSERT, INTO, VALUES
1738        assert_eq!(
1739            kinds.iter().filter(|k| **k == TokenKind::Keyword).count(),
1740            3
1741        );
1742    }
1743
1744    #[test]
1745    fn statement_with_comment() {
1746        let tokens = tokenize("SELECT 1 -- comment");
1747        let sig: Vec<_> = tokens
1748            .iter()
1749            .filter(|t| t.kind != TokenKind::Whitespace)
1750            .collect();
1751        assert_eq!(sig.len(), 3); // SELECT, 1, comment
1752        assert_eq!(sig[2].kind, TokenKind::LineComment);
1753    }
1754
1755    #[test]
1756    fn statement_with_block_comment() {
1757        let tokens = tokenize("SELECT /* mid */ 1");
1758        let sig: Vec<_> = tokens
1759            .iter()
1760            .filter(|t| t.kind != TokenKind::Whitespace)
1761            .collect();
1762        assert_eq!(sig[0].kind, TokenKind::Keyword); // SELECT
1763        assert_eq!(sig[1].kind, TokenKind::BlockComment); // /* mid */
1764        assert_eq!(sig[2].kind, TokenKind::NumberLiteral); // 1
1765    }
1766
1767    // ===== Grammar Context Tests =====
1768
1769    #[test]
1770    fn context_at_start() {
1771        assert_eq!(grammar_context_at_end(""), GrammarContext::Start);
1772    }
1773
1774    #[test]
1775    fn context_after_select() {
1776        assert_eq!(
1777            grammar_context_at_end("SELECT "),
1778            GrammarContext::ExpectColumnList
1779        );
1780    }
1781
1782    #[test]
1783    fn context_after_from() {
1784        assert_eq!(
1785            grammar_context_at_end("SELECT * FROM "),
1786            GrammarContext::ExpectTable
1787        );
1788    }
1789
1790    #[test]
1791    fn context_after_into() {
1792        assert_eq!(
1793            grammar_context_at_end("INSERT INTO "),
1794            GrammarContext::ExpectTable
1795        );
1796    }
1797
1798    #[test]
1799    fn context_after_update() {
1800        assert_eq!(
1801            grammar_context_at_end("UPDATE "),
1802            GrammarContext::ExpectTable
1803        );
1804    }
1805
1806    #[test]
1807    fn context_after_use() {
1808        assert_eq!(
1809            grammar_context_at_end("USE "),
1810            GrammarContext::ExpectKeyspace
1811        );
1812    }
1813
1814    #[test]
1815    fn context_after_where() {
1816        assert_eq!(
1817            grammar_context_at_end("SELECT * FROM t WHERE "),
1818            GrammarContext::ExpectColumn
1819        );
1820    }
1821
1822    #[test]
1823    fn context_after_dot() {
1824        assert_eq!(
1825            grammar_context_at_end("ks."),
1826            GrammarContext::ExpectQualifiedPart
1827        );
1828    }
1829
1830    #[test]
1831    fn context_after_table_name() {
1832        assert_eq!(
1833            grammar_context_at_end("SELECT * FROM users "),
1834            GrammarContext::ExpectSelectPostFrom
1835        );
1836    }
1837
1838    #[test]
1839    fn context_after_consistency() {
1840        assert_eq!(
1841            grammar_context_at_end("CONSISTENCY "),
1842            GrammarContext::ExpectConsistencyLevel
1843        );
1844    }
1845
1846    #[test]
1847    fn context_after_describe() {
1848        assert_eq!(
1849            grammar_context_at_end("DESCRIBE "),
1850            GrammarContext::ExpectDescribeTarget
1851        );
1852    }
1853
1854    #[test]
1855    fn context_after_source() {
1856        assert_eq!(
1857            grammar_context_at_end("SOURCE "),
1858            GrammarContext::ExpectFilePath
1859        );
1860    }
1861
1862    #[test]
1863    fn context_after_order_by() {
1864        assert_eq!(
1865            grammar_context_at_end("SELECT * FROM t ORDER BY "),
1866            GrammarContext::ExpectOrderByColumn
1867        );
1868    }
1869
1870    #[test]
1871    fn context_after_values() {
1872        assert_eq!(
1873            grammar_context_at_end("INSERT INTO t (id) VALUES "),
1874            GrammarContext::ExpectValues
1875        );
1876    }
1877
1878    #[test]
1879    fn context_after_with() {
1880        assert_eq!(
1881            grammar_context_at_end("CREATE TABLE t (id int) WITH "),
1882            GrammarContext::ExpectWithOption
1883        );
1884    }
1885
1886    // ===== Edge Cases =====
1887
1888    #[test]
1889    fn empty_input() {
1890        assert!(tokenize("").is_empty());
1891    }
1892
1893    #[test]
1894    fn only_whitespace() {
1895        let tokens = tokenize("   \t\n  ");
1896        assert_eq!(tokens.len(), 1);
1897        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
1898    }
1899
1900    #[test]
1901    fn only_comment() {
1902        let tokens = tokenize("-- just a comment");
1903        assert_eq!(tokens.len(), 1);
1904        assert_eq!(tokens[0].kind, TokenKind::LineComment);
1905    }
1906
1907    #[test]
1908    fn unterminated_string() {
1909        let tokens = tokenize("'unterminated");
1910        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1911        assert_eq!(tokens[0].text, "'unterminated");
1912    }
1913
1914    #[test]
1915    fn unterminated_quoted_identifier() {
1916        let tokens = tokenize("\"unterminated");
1917        assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1918    }
1919
1920    #[test]
1921    fn unterminated_dollar_string() {
1922        let tokens = tokenize("$$unterminated");
1923        assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1924    }
1925
1926    #[test]
1927    fn unterminated_block_comment() {
1928        let tokens = tokenize("/* unterminated");
1929        assert_eq!(tokens[0].kind, TokenKind::BlockComment);
1930    }
1931
1932    #[test]
1933    fn unicode_in_string() {
1934        let tokens = tokenize("'héllo wörld'");
1935        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1936        assert_eq!(tokens[0].text, "'héllo wörld'");
1937    }
1938
1939    #[test]
1940    fn unicode_in_quoted_identifier() {
1941        let tokens = tokenize("\"naïve\"");
1942        assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1943        assert_eq!(tokens[0].text, "\"naïve\"");
1944    }
1945
1946    #[test]
1947    fn multiple_statements() {
1948        let tokens = tokenize("SELECT 1; SELECT 2;");
1949        let semis: Vec<_> = tokens.iter().filter(|t| t.text == ";").collect();
1950        assert_eq!(semis.len(), 2);
1951    }
1952
1953    #[test]
1954    fn comment_like_in_string() {
1955        let tokens = tokenize("'-- not a comment'");
1956        assert_eq!(tokens.len(), 1);
1957        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1958    }
1959
1960    #[test]
1961    fn block_comment_like_in_string() {
1962        let tokens = tokenize("'/* not a comment */'");
1963        assert_eq!(tokens.len(), 1);
1964        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1965    }
1966
1967    #[test]
1968    fn negative_number_after_operator() {
1969        let tokens = tokenize("id = -42");
1970        let sig: Vec<_> = tokens
1971            .iter()
1972            .filter(|t| t.kind != TokenKind::Whitespace)
1973            .collect();
1974        assert_eq!(sig[2].kind, TokenKind::NumberLiteral);
1975        assert_eq!(sig[2].text, "-42");
1976    }
1977
1978    #[test]
1979    fn minus_as_operator_after_number() {
1980        let tokens = tokenize("5 - 3");
1981        let sig: Vec<_> = tokens
1982            .iter()
1983            .filter(|t| t.kind != TokenKind::Whitespace)
1984            .collect();
1985        assert_eq!(sig[0].kind, TokenKind::NumberLiteral);
1986        // After a number, - is an operator, not a sign. But the next 3 is a separate number.
1987        // The '-' here: previous token is whitespace (after 5), so it could be sign.
1988        // Actually after NumberLiteral + Whitespace, the '-' is ambiguous.
1989        // Let's just verify we get reasonable output.
1990        assert!(sig.len() >= 3);
1991    }
1992
1993    #[test]
1994    fn blob_after_value_context() {
1995        let tokens = tokenize("INSERT INTO t (b) VALUES (0xDEAD)");
1996        let sig: Vec<_> = tokens
1997            .iter()
1998            .filter(|t| t.kind != TokenKind::Whitespace)
1999            .collect();
2000        let blob = sig.iter().find(|t| t.text == "0xDEAD").unwrap();
2001        assert_eq!(blob.kind, TokenKind::BlobLiteral);
2002    }
2003
2004    #[test]
2005    fn keyword_list_is_sorted() {
2006        for window in CQL_KEYWORDS.windows(2) {
2007            assert!(
2008                window[0] < window[1],
2009                "CQL_KEYWORDS not sorted: {:?} >= {:?}",
2010                window[0],
2011                window[1]
2012            );
2013        }
2014    }
2015
2016    // ===== strip_comments =====
2017
2018    #[test]
2019    fn strip_line_comment() {
2020        let result = strip_comments("SELECT 1 -- comment");
2021        assert_eq!(result, "SELECT 1 ");
2022    }
2023
2024    #[test]
2025    fn strip_block_comment() {
2026        let result = strip_comments("SELECT /* x */ 1");
2027        assert_eq!(result, "SELECT   1");
2028    }
2029
2030    #[test]
2031    fn strip_preserves_strings() {
2032        let result = strip_comments("SELECT '-- not a comment'");
2033        assert_eq!(result, "SELECT '-- not a comment'");
2034    }
2035
2036    #[test]
2037    fn strip_preserves_dollar_strings() {
2038        let result = strip_comments("SELECT $$-- not a comment$$");
2039        assert_eq!(result, "SELECT $$-- not a comment$$");
2040    }
2041
2042    #[test]
2043    fn strip_nested_block_comments() {
2044        let result = strip_comments("SELECT /* outer /* inner */ still */ 1");
2045        assert_eq!(result, "SELECT   1");
2046    }
2047
2048    #[test]
2049    fn strip_slash_slash_comment() {
2050        let result = strip_comments("SELECT 1 // comment");
2051        assert_eq!(result, "SELECT 1 ");
2052    }
2053
2054    #[test]
2055    fn strip_slash_slash_preserves_strings() {
2056        let result = strip_comments("SELECT '// not a comment'");
2057        assert_eq!(result, "SELECT '// not a comment'");
2058    }
2059
2060    // ===== is_cql_keyword =====
2061
2062    #[test]
2063    fn keyword_lookup_positive() {
2064        assert!(is_cql_keyword("SELECT"));
2065        assert!(is_cql_keyword("select"));
2066        assert!(is_cql_keyword("From"));
2067        assert!(is_cql_keyword("WHERE"));
2068    }
2069
2070    #[test]
2071    fn keyword_lookup_negative() {
2072        assert!(!is_cql_keyword("my_table"));
2073        assert!(!is_cql_keyword("hello"));
2074        assert!(!is_cql_keyword("xyz"));
2075    }
2076
2077    // ===== significant_tokens helper =====
2078
2079    #[test]
2080    fn significant_tokens_filters_whitespace_and_comments() {
2081        let tokens = tokenize("SELECT /* comment */ * -- line\nFROM t");
2082        let sig = significant_tokens(&tokens);
2083        let kinds: Vec<_> = sig.iter().map(|t| &t.kind).collect();
2084        assert!(!kinds.contains(&&TokenKind::Whitespace));
2085        assert!(!kinds.contains(&&TokenKind::LineComment));
2086        assert!(!kinds.contains(&&TokenKind::BlockComment));
2087    }
2088
2089    // ===== Regression: colorizer false-positive tests =====
2090
2091    #[test]
2092    fn users_not_keyword_after_from() {
2093        let tokens = tokenize("SELECT * FROM users");
2094        let sig: Vec<_> = tokens
2095            .iter()
2096            .filter(|t| t.kind != TokenKind::Whitespace)
2097            .collect();
2098        assert_eq!(sig[3].text, "users");
2099        assert_eq!(sig[3].kind, TokenKind::Identifier);
2100    }
2101
2102    #[test]
2103    fn key_not_keyword_after_from() {
2104        let tokens = tokenize("SELECT key FROM my_table WHERE key = 1");
2105        let sig: Vec<_> = tokens
2106            .iter()
2107            .filter(|t| t.kind != TokenKind::Whitespace)
2108            .collect();
2109        // "key" after SELECT is in column list -> Identifier
2110        assert_eq!(sig[1].kind, TokenKind::Identifier);
2111        // "my_table" after FROM -> Identifier
2112        assert_eq!(sig[3].kind, TokenKind::Identifier);
2113        // "key" after WHERE -> Identifier
2114        assert_eq!(sig[5].kind, TokenKind::Identifier);
2115    }
2116
2117    #[test]
2118    fn set_not_keyword_in_column_list() {
2119        // "set" as a column name in SELECT
2120        let tokens = tokenize("SELECT set FROM my_table");
2121        let sig: Vec<_> = tokens
2122            .iter()
2123            .filter(|t| t.kind != TokenKind::Whitespace)
2124            .collect();
2125        assert_eq!(sig[1].text, "set");
2126        assert_eq!(sig[1].kind, TokenKind::Identifier);
2127    }
2128
2129    #[test]
2130    fn column_names_after_where_are_identifiers() {
2131        let tokens = tokenize("SELECT * FROM t WHERE user = 'test' AND key = 1");
2132        let sig: Vec<_> = tokens
2133            .iter()
2134            .filter(|t| t.kind != TokenKind::Whitespace)
2135            .collect();
2136        // user after WHERE
2137        assert_eq!(sig[5].text, "user");
2138        assert_eq!(sig[5].kind, TokenKind::Identifier);
2139        // key after AND
2140        assert_eq!(sig[9].text, "key");
2141        assert_eq!(sig[9].kind, TokenKind::Identifier);
2142    }
2143
2144    // ===== Complex queries =====
2145
2146    #[test]
2147    fn select_with_function() {
2148        let tokens = tokenize("SELECT count(*) FROM users");
2149        let sig: Vec<_> = tokens
2150            .iter()
2151            .filter(|t| t.kind != TokenKind::Whitespace)
2152            .collect();
2153        assert_eq!(sig[0].kind, TokenKind::Keyword); // SELECT
2154        assert_eq!(sig[1].kind, TokenKind::Identifier); // count (in column list context)
2155    }
2156
2157    #[test]
2158    fn batch_statement() {
2159        let input =
2160            "BEGIN BATCH INSERT INTO t (id) VALUES (1); INSERT INTO t (id) VALUES (2); APPLY BATCH";
2161        let tokens = tokenize(input);
2162        let keywords: Vec<_> = tokens
2163            .iter()
2164            .filter(|t| t.kind == TokenKind::Keyword)
2165            .collect();
2166        assert!(keywords.iter().any(|t| t.text.to_uppercase() == "BEGIN"));
2167        assert!(keywords.iter().any(|t| t.text.to_uppercase() == "BATCH"));
2168        assert!(keywords.iter().any(|t| t.text.to_uppercase() == "APPLY"));
2169    }
2170
2171    #[test]
2172    fn delete_from() {
2173        let kinds = significant_kinds("DELETE FROM users WHERE id = 1");
2174        assert_eq!(kinds[0], TokenKind::Keyword); // DELETE
2175        assert_eq!(kinds[1], TokenKind::Keyword); // FROM
2176        assert_eq!(kinds[2], TokenKind::Identifier); // users
2177    }
2178
2179    #[test]
2180    fn describe_table() {
2181        let kinds = significant_kinds("DESCRIBE TABLE users");
2182        assert_eq!(kinds[0], TokenKind::Keyword); // DESCRIBE
2183        assert_eq!(kinds[1], TokenKind::Keyword); // TABLE
2184        assert_eq!(kinds[2], TokenKind::Identifier); // users
2185    }
2186
2187    #[test]
2188    fn truncate_table() {
2189        let kinds = significant_kinds("TRUNCATE users");
2190        assert_eq!(kinds[0], TokenKind::Keyword); // TRUNCATE
2191        assert_eq!(kinds[1], TokenKind::Identifier); // users
2192    }
2193
2194    #[test]
2195    fn select_distinct() {
2196        let kinds = significant_kinds("SELECT DISTINCT partition_key FROM t");
2197        assert_eq!(kinds[0], TokenKind::Keyword); // SELECT
2198        assert_eq!(kinds[1], TokenKind::Keyword); // DISTINCT
2199        assert_eq!(kinds[2], TokenKind::Identifier); // partition_key
2200    }
2201
2202    #[test]
2203    fn consistency_level() {
2204        let kinds = significant_kinds("CONSISTENCY QUORUM");
2205        assert_eq!(kinds[0], TokenKind::Keyword); // CONSISTENCY
2206        assert_eq!(kinds[1], TokenKind::Identifier); // QUORUM (in consistency level context)
2207    }
2208
2209    #[test]
2210    fn serial_consistency() {
2211        let ctx = grammar_context_at_end("SERIAL CONSISTENCY ");
2212        assert_eq!(ctx, GrammarContext::ExpectConsistencyLevel);
2213    }
2214
2215    #[test]
2216    fn order_by_column() {
2217        let sig: Vec<_> = tokenize("SELECT * FROM t ORDER BY created_at")
2218            .into_iter()
2219            .filter(|t| t.kind != TokenKind::Whitespace)
2220            .collect();
2221        assert_eq!(sig.last().unwrap().kind, TokenKind::Identifier); // created_at
2222    }
2223
2224    // ===== Tests for SP20: Grammar-aware completion =====
2225
2226    #[test]
2227    fn grammar_context_qualified_table_after_from() {
2228        // Bug #85: SELECT * FROM system.c⇥ should recognize table context, not fall to General
2229        let ctx = grammar_context_at_end("SELECT * FROM system.");
2230        assert_eq!(ctx, GrammarContext::ExpectQualifiedPart);
2231    }
2232
2233    #[test]
2234    fn grammar_context_qualified_table_partial() {
2235        // When user has started typing after the dot
2236        let ctx = grammar_context_at_end("SELECT * FROM system.c");
2237        // Last token is 'c' (identifier), second-to-last is '.', third-to-last is 'system'
2238        // Should still recognize we're in table context with keyspace qualifier
2239        // For now, this will fail and show the bug
2240        assert_eq!(ctx, GrammarContext::ExpectQualifiedPart);
2241    }
2242
2243    #[test]
2244    fn grammar_context_select_column_list() {
2245        let ctx = grammar_context_at_end("SELECT ");
2246        assert_eq!(ctx, GrammarContext::ExpectColumnList);
2247    }
2248
2249    #[test]
2250    fn grammar_context_create_target() {
2251        assert_eq!(
2252            grammar_context_at_end("CREATE "),
2253            GrammarContext::ExpectCreateTarget
2254        );
2255    }
2256
2257    #[test]
2258    fn grammar_context_alter_target() {
2259        assert_eq!(
2260            grammar_context_at_end("ALTER "),
2261            GrammarContext::ExpectAlterTarget
2262        );
2263    }
2264
2265    #[test]
2266    fn grammar_context_drop_target() {
2267        assert_eq!(
2268            grammar_context_at_end("DROP "),
2269            GrammarContext::ExpectDropTarget
2270        );
2271    }
2272
2273    #[test]
2274    fn grammar_context_delete_target() {
2275        assert_eq!(
2276            grammar_context_at_end("DELETE "),
2277            GrammarContext::ExpectDeleteTarget
2278        );
2279    }
2280
2281    #[test]
2282    fn grammar_context_grant_permissions() {
2283        assert_eq!(
2284            grammar_context_at_end("GRANT "),
2285            GrammarContext::ExpectGrantRevoke
2286        );
2287    }
2288
2289    #[test]
2290    fn grammar_context_revoke_permissions() {
2291        assert_eq!(
2292            grammar_context_at_end("REVOKE "),
2293            GrammarContext::ExpectGrantRevoke
2294        );
2295    }
2296
2297    #[test]
2298    fn grammar_context_insert_target() {
2299        assert_eq!(
2300            grammar_context_at_end("INSERT "),
2301            GrammarContext::ExpectInsertTarget
2302        );
2303    }
2304
2305    #[test]
2306    fn grammar_context_insert_into_table() {
2307        assert_eq!(
2308            grammar_context_at_end("INSERT INTO "),
2309            GrammarContext::ExpectTable
2310        );
2311    }
2312
2313    #[test]
2314    fn grammar_context_begin_target() {
2315        assert_eq!(
2316            grammar_context_at_end("BEGIN "),
2317            GrammarContext::ExpectBeginTarget
2318        );
2319    }
2320
2321    #[test]
2322    fn grammar_context_update_expects_table() {
2323        assert_eq!(
2324            grammar_context_at_end("UPDATE "),
2325            GrammarContext::ExpectTable
2326        );
2327    }
2328
2329    #[test]
2330    fn grammar_context_truncate_expects_table() {
2331        assert_eq!(
2332            grammar_context_at_end("TRUNCATE "),
2333            GrammarContext::ExpectTable
2334        );
2335    }
2336
2337    // ===== Bug 7/9/10: qualified table names trigger post-table context =====
2338
2339    #[test]
2340    fn grammar_context_select_post_from_qualified() {
2341        // SELECT * FROM ks.table<space> should give ExpectSelectPostFrom, not General
2342        assert_eq!(
2343            grammar_context_at_end("SELECT * FROM test_ks.users "),
2344            GrammarContext::ExpectSelectPostFrom
2345        );
2346    }
2347
2348    #[test]
2349    fn grammar_context_update_clause_qualified() {
2350        // UPDATE ks.table<space> should give ExpectUpdateClause
2351        assert_eq!(
2352            grammar_context_at_end("UPDATE test_ks.users "),
2353            GrammarContext::ExpectUpdateClause
2354        );
2355    }
2356
2357    #[test]
2358    fn grammar_context_delete_post_from_qualified() {
2359        // DELETE FROM ks.table<space> should give ExpectDeletePostFrom
2360        assert_eq!(
2361            grammar_context_at_end("DELETE FROM test_ks.users "),
2362            GrammarContext::ExpectDeletePostFrom
2363        );
2364    }
2365
2366    #[test]
2367    fn grammar_context_insert_into_qualified() {
2368        // INSERT INTO ks.table<space> — INTO is the keyword before table
2369        // Current code returns General for INTO, which is acceptable
2370        assert_eq!(
2371            grammar_context_at_end("INSERT INTO test_ks.users "),
2372            GrammarContext::General
2373        );
2374    }
2375}