cqlsh_rs/
cql_lexer.rs

1//! Unified CQL lexer (tokenizer) with grammar-aware position tracking.
2//!
3//! Provides a single shared tokenizer that powers syntax highlighting (colorizer),
4//! tab completion (completer), and statement parsing (parser). Replaces three
5//! ad-hoc implementations with one consistent CQL understanding.
6//!
7//! Design: hand-written state machine, O(n) single pass, no dependencies.
8//! See `docs/plans/18-cql-lexer.md` for motivation and design decisions.
9
/// A token produced by the CQL lexer.
///
/// `start..end` is the byte range of this token in the original input, and
/// `text` is an owned copy of exactly that slice. Every branch of [`tokenize`]
/// pushes the span it consumed, so the tokens tile the input: concatenating
/// the `text` of all tokens reproduces the input verbatim (whitespace and
/// comments included).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// The kind of token.
    pub kind: TokenKind,
    /// The raw text of the token as it appears in the input.
    pub text: String,
    /// Byte offset where this token starts in the input.
    pub start: usize,
    /// Byte offset where this token ends (exclusive) in the input.
    pub end: usize,
}
22
/// Classification of a CQL token.
///
/// Classification mixes lexical shape (quotes, digits, comment markers) with
/// grammar position (see [`GrammarContext`]): the same word may lex as
/// `Keyword` in one position and `Identifier` in another.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// A CQL keyword (SELECT, FROM, WHERE, etc.). Determined by context — words
    /// in identifier position (after FROM, after dot, etc.) are `Identifier` instead.
    Keyword,
    /// An unquoted identifier (table name, column name, keyspace name, etc.).
    Identifier,
    /// A double-quoted identifier (`"MyTable"`). Text includes the quotes.
    QuotedIdentifier,
    /// A single-quoted string literal (`'hello'`). Text includes the quotes;
    /// a doubled `''` inside is the escape for a single quote.
    StringLiteral,
    /// A dollar-quoted string literal (`$$body$$`). Text includes the `$$` fences.
    DollarStringLiteral,
    /// A numeric literal (integer or decimal: `42`, `3.14`, `-1`). The leading
    /// minus sign is part of the token only in sign positions (see
    /// `is_number_sign_position`).
    NumberLiteral,
    /// A blob literal (`0xDEADBEEF`).
    BlobLiteral,
    /// A UUID literal (`550e8400-e29b-41d4-a716-446655440000`).
    UuidLiteral,
    /// A boolean literal (`true`, `false`), matched case-insensitively.
    BooleanLiteral,
    /// An operator (`=`, `<`, `>`, `<=`, `>=`, `!=`, `+`, `-`, etc.).
    Operator,
    /// Punctuation (`;`, `,`, `(`, `)`, `.`, `*`, `?`).
    Punctuation,
    /// Whitespace (spaces, tabs, newlines). Kept in the stream so offsets tile
    /// the input; filter with [`significant_tokens`] when it is noise.
    Whitespace,
    /// A line comment (`-- ...`), excluding the trailing newline.
    LineComment,
    /// A block comment (`/* ... */`), possibly nested.
    BlockComment,
    /// Unrecognized character.
    Unknown,
}
58
/// Grammar context: what syntactic position we're at, used to distinguish
/// keywords from identifiers and to drive tab completion.
///
/// The lexer threads this through `classify_word` / `advance_context_after_*`
/// while scanning; `context_from_tokens` re-derives it for a finished token
/// stream (e.g. for completion at the end of a line).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GrammarContext {
    /// Start of statement — expecting a keyword like SELECT, INSERT, etc.
    Start,
    /// After SELECT — expecting column list, *, or DISTINCT.
    ExpectColumnList,
    /// After FROM / INTO / UPDATE / TABLE / INDEX ON — expecting a table name.
    ExpectTable,
    /// After USE / KEYSPACE — expecting a keyspace name.
    ExpectKeyspace,
    /// After WHERE / AND / IF (conditions) — expecting a column name.
    ExpectColumn,
    /// After SET (in UPDATE) — expecting column = value pairs.
    ExpectSetClause,
    /// After a dot — expecting the second part of a qualified name.
    ExpectQualifiedPart,
    /// After a column/table type keyword — expecting a CQL type name.
    ExpectType,
    /// After ORDER — expecting BY.
    ExpectOrderBy,
    /// After ORDER BY — expecting column name.
    ExpectOrderByColumn,
    /// After VALUES — expecting ( value_list ).
    ExpectValues,
    /// Inside WITH clause options.
    ExpectWithOption,
    /// After CONSISTENCY / SERIAL CONSISTENCY — expecting level name.
    ExpectConsistencyLevel,
    /// After DESCRIBE / DESC — expecting sub-command or schema name.
    ExpectDescribeTarget,
    /// After SOURCE / CAPTURE — expecting file path.
    ExpectFilePath,
    /// General clause context (default within a statement body).
    General,
}
96
/// Set of CQL keywords and shell commands (uppercase, ASCII-sorted).
///
/// MUST remain sorted in ascending byte order: [`is_cql_keyword`] binary
/// searches it. Entries are pure ASCII uppercase.
const CQL_KEYWORDS: &[&str] = &[
    "ADD",
    "AGGREGATE",
    "ALL",
    "ALLOW",
    "ALTER",
    "AND",
    "APPLY",
    "AS",
    "ASC",
    "AUTHORIZE",
    "BATCH",
    "BEGIN",
    "BY",
    "CALLED",
    "CAPTURE",
    "CLEAR",
    "CLS",
    "CLUSTERING",
    "COLUMN",
    "COMPACT",
    "CONSISTENCY",
    "CONTAINS",
    "COPY",
    "COUNT",
    "COUNTER",
    "CREATE",
    "CUSTOM",
    "DELETE",
    "DESC",
    "DESCRIBE",
    "DISTINCT",
    "DROP",
    "EACH_QUORUM",
    "ENTRIES",
    "EXECUTE",
    "EXISTS",
    "EXIT",
    "EXPAND",
    "FILTERING",
    "FINALFUNC",
    "FROM",
    "FROZEN",
    "FULL",
    "FUNCTION",
    "FUNCTIONS",
    "GRANT",
    "HELP",
    "IF",
    "IN",
    "INDEX",
    "INITCOND",
    "INPUT",
    "INSERT",
    "INTO",
    "IS",
    "JSON",
    "KEY",
    "KEYSPACE",
    "KEYSPACES",
    "LANGUAGE",
    "LIKE",
    "LIMIT",
    "LIST",
    "LOCAL_ONE",
    "LOCAL_QUORUM",
    "LOGIN",
    "MAP",
    "MATERIALIZED",
    "MODIFY",
    "NAMESPACE",
    "NORECURSIVE",
    "NOT",
    "NULL",
    "OF",
    "ON",
    "ONE",
    "OR",
    "ORDER",
    "PAGING",
    "PARTITION",
    "PASSWORD",
    "PER",
    "PERMISSION",
    "PERMISSIONS",
    "PRIMARY",
    "QUIT",
    "QUORUM",
    "RENAME",
    "REPLACE",
    "RETURNS",
    "REVOKE",
    "SCHEMA",
    "SELECT",
    "SERIAL",
    "SET",
    "SFUNC",
    "SHOW",
    "SOURCE",
    "STATIC",
    "STORAGE",
    "STYPE",
    "SUPERUSER",
    "TABLE",
    "TABLES",
    "TEXT",
    "THREE",
    "TIMESTAMP",
    "TO",
    "TOKEN",
    "TRACING",
    "TRIGGER",
    "TRUNCATE",
    "TTL",
    "TUPLE",
    "TWO",
    "TYPE",
    "UNICODE",
    "UNLOGGED",
    "UPDATE",
    "USE",
    "USER",
    "USERS",
    "USING",
    "VALUES",
    "VIEW",
    "WHERE",
    "WITH",
    "WRITETIME",
];

/// Check if a word is a CQL keyword (case-insensitive).
///
/// Compares the word's bytes, uppercased on the fly, against the sorted
/// `CQL_KEYWORDS` table. This avoids allocating a temporary `String` on every
/// call (the lexer calls this once per word token, and `classify_word` passes
/// an already-uppercased copy, which previously uppercased twice). Non-ASCII
/// bytes are left unchanged and can never match the pure-ASCII table.
pub fn is_cql_keyword(word: &str) -> bool {
    CQL_KEYWORDS
        .binary_search_by(|k| k.bytes().cmp(word.bytes().map(|b| b.to_ascii_uppercase())))
        .is_ok()
}
234
/// Tokenize a CQL input string into a sequence of tokens with grammar context.
///
/// This is the main entry point. It performs a single O(n) pass and classifies
/// each token using both lexical rules and grammar position tracking.
///
/// Invariants visible from the implementation:
/// - Every branch pushes exactly the span it consumed, so the returned tokens
///   tile the input (offsets are contiguous and cover every byte).
/// - Unterminated strings / block comments / dollar quotes extend to the end
///   of input; no error token is emitted.
/// - Branch order matters: comments before operators (`--`, `/*`), blob before
///   number (`0x`), UUID checks inside the number and word branches.
pub fn tokenize(input: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    // Grammar position, used to tell keywords from identifiers (e.g. a table
    // actually named "select" after FROM stays an Identifier).
    let mut ctx = GrammarContext::Start;
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut i = 0;

    while i < len {
        let ch = bytes[i];

        // Whitespace — kept as a token so offsets tile the input.
        if ch.is_ascii_whitespace() {
            let start = i;
            while i < len && bytes[i].is_ascii_whitespace() {
                i += 1;
            }
            tokens.push(Token {
                kind: TokenKind::Whitespace,
                text: input[start..i].to_string(),
                start,
                end: i,
            });
            continue;
        }

        // Line comment: -- up to (not including) the newline.
        if ch == b'-' && i + 1 < len && bytes[i + 1] == b'-' {
            let start = i;
            i += 2;
            while i < len && bytes[i] != b'\n' {
                i += 1;
            }
            tokens.push(Token {
                kind: TokenKind::LineComment,
                text: input[start..i].to_string(),
                start,
                end: i,
            });
            continue;
        }

        // Block comment: /* ... */ (nested; depth-counted).
        if ch == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
            let start = i;
            let mut depth: usize = 1;
            i += 2;
            while i < len && depth > 0 {
                if bytes[i] == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
                    depth += 1;
                    i += 2;
                } else if bytes[i] == b'*' && i + 1 < len && bytes[i + 1] == b'/' {
                    depth -= 1;
                    i += 2;
                } else {
                    i += 1;
                }
            }
            tokens.push(Token {
                kind: TokenKind::BlockComment,
                text: input[start..i].to_string(),
                start,
                end: i,
            });
            continue;
        }

        // Single-quoted string literal; '' is the escape for a quote.
        if ch == b'\'' {
            let start = i;
            i += 1;
            loop {
                if i >= len {
                    break; // unterminated — token runs to end of input
                }
                if bytes[i] == b'\'' {
                    i += 1;
                    // Escaped quote '' — continue string
                    if i < len && bytes[i] == b'\'' {
                        i += 1;
                        continue;
                    }
                    break;
                }
                // Advance by UTF-8 char length so we never split a multi-byte char.
                i += char_len_at(bytes, i);
            }
            tokens.push(Token {
                kind: TokenKind::StringLiteral,
                text: input[start..i].to_string(),
                start,
                end: i,
            });
            ctx = advance_context_after_value(ctx);
            continue;
        }

        // Double-quoted identifier; "" is the escape for a quote.
        if ch == b'"' {
            let start = i;
            i += 1;
            loop {
                if i >= len {
                    break; // unterminated
                }
                if bytes[i] == b'"' {
                    i += 1;
                    // Escaped quote "" — continue
                    if i < len && bytes[i] == b'"' {
                        i += 1;
                        continue;
                    }
                    break;
                }
                i += char_len_at(bytes, i);
            }
            tokens.push(Token {
                kind: TokenKind::QuotedIdentifier,
                text: input[start..i].to_string(),
                start,
                end: i,
            });
            ctx = advance_context_after_name(ctx);
            continue;
        }

        // Dollar-quoted string: $$...$$ (no escapes inside).
        if ch == b'$' && i + 1 < len && bytes[i + 1] == b'$' {
            let start = i;
            i += 2;
            loop {
                if i + 1 >= len {
                    i = len;
                    break; // unterminated
                }
                if bytes[i] == b'$' && bytes[i + 1] == b'$' {
                    i += 2;
                    break;
                }
                i += 1;
            }
            tokens.push(Token {
                kind: TokenKind::DollarStringLiteral,
                text: input[start..i].to_string(),
                start,
                end: i,
            });
            ctx = advance_context_after_value(ctx);
            continue;
        }

        // Blob literal: 0x followed by hex digits (an empty 0x is accepted).
        if ch == b'0' && i + 1 < len && (bytes[i + 1] == b'x' || bytes[i + 1] == b'X') {
            let start = i;
            i += 2;
            while i < len && bytes[i].is_ascii_hexdigit() {
                i += 1;
            }
            // Make sure it's not followed by an identifier char (would be an identifier)
            if i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                // Actually an identifier starting with 0x... — backtrack.
                // No `continue` here: control falls through to the number
                // branch below (ch == b'0' is a digit), which re-scans and
                // ends up taking its identifier fallback.
                i = start;
            } else {
                tokens.push(Token {
                    kind: TokenKind::BlobLiteral,
                    text: input[start..i].to_string(),
                    start,
                    end: i,
                });
                ctx = advance_context_after_value(ctx);
                continue;
            }
        }

        // Number literal: digits, optional dot, optional exponent
        // Also handles negative numbers when preceded by operator context
        if ch.is_ascii_digit()
            || (ch == b'-'
                && i + 1 < len
                && bytes[i + 1].is_ascii_digit()
                && is_number_sign_position(&tokens))
        {
            let start = i;
            if ch == b'-' {
                i += 1;
            }
            while i < len && bytes[i].is_ascii_digit() {
                i += 1;
            }
            // Decimal part — only consumed when a digit follows the dot, so
            // "1." leaves the dot for the punctuation branch.
            if i < len && bytes[i] == b'.' && i + 1 < len && bytes[i + 1].is_ascii_digit() {
                i += 1;
                while i < len && bytes[i].is_ascii_digit() {
                    i += 1;
                }
            }
            // Exponent — backtracks if 'e'/'E' is not followed by digits,
            // so "1easy" keeps the 'e' for the word branch.
            if i < len && (bytes[i] == b'e' || bytes[i] == b'E') {
                let save = i;
                i += 1;
                if i < len && (bytes[i] == b'+' || bytes[i] == b'-') {
                    i += 1;
                }
                if i < len && bytes[i].is_ascii_digit() {
                    while i < len && bytes[i].is_ascii_digit() {
                        i += 1;
                    }
                } else {
                    i = save; // not an exponent, backtrack
                }
            }
            // UUID check: number followed by '-' hex pattern (8-4-4-4-12),
            // e.g. a UUID whose first group is all decimal digits.
            if i < len && bytes[i] == b'-' && looks_like_uuid(input, start, i) {
                // Parse the full UUID
                let uuid_end = scan_uuid(input, start);
                if uuid_end > i {
                    i = uuid_end;
                    tokens.push(Token {
                        kind: TokenKind::UuidLiteral,
                        text: input[start..i].to_string(),
                        start,
                        end: i,
                    });
                    ctx = advance_context_after_value(ctx);
                    continue;
                }
            }
            // Make sure this isn't part of an identifier (like table1)
            if i < len && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_') {
                // It's an identifier starting with digits — shouldn't happen in valid CQL,
                // but treat as identifier
                while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                    i += 1;
                }
                let word = &input[start..i];
                tokens.push(Token {
                    kind: classify_word(word, ctx),
                    text: word.to_string(),
                    start,
                    end: i,
                });
                ctx = advance_context_after_word(word, ctx);
                continue;
            }
            tokens.push(Token {
                kind: TokenKind::NumberLiteral,
                text: input[start..i].to_string(),
                start,
                end: i,
            });
            ctx = advance_context_after_value(ctx);
            continue;
        }

        // Word: identifier or keyword (ASCII letters, digits, underscore).
        if ch.is_ascii_alphabetic() || ch == b'_' {
            let start = i;
            while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                i += 1;
            }
            let word = &input[start..i];

            // UUID check: 8-hex-digit word followed by '-' hex pattern.
            if i < len
                && bytes[i] == b'-'
                && word.len() == 8
                && word.chars().all(|c| c.is_ascii_hexdigit())
            {
                let uuid_end = scan_uuid(input, start);
                if uuid_end > i {
                    i = uuid_end;
                    tokens.push(Token {
                        kind: TokenKind::UuidLiteral,
                        text: input[start..i].to_string(),
                        start,
                        end: i,
                    });
                    ctx = advance_context_after_value(ctx);
                    continue;
                }
            }

            let kind = classify_word(word, ctx);
            tokens.push(Token {
                kind,
                text: word.to_string(),
                start,
                end: i,
            });
            ctx = advance_context_after_word(word, ctx);
            continue;
        }

        // Operators — ctx is deliberately left unchanged here.
        if is_operator_char(ch) {
            let start = i;
            // Two-char operators: <=, >=, !=
            if i + 1 < len && is_two_char_operator(ch, bytes[i + 1]) {
                i += 2;
            } else {
                i += 1;
            }
            tokens.push(Token {
                kind: TokenKind::Operator,
                text: input[start..i].to_string(),
                start,
                end: i,
            });
            continue;
        }

        // Punctuation
        if is_punctuation(ch) {
            let start = i;
            i += 1;
            let text = input[start..i].to_string();

            // Track dot for qualified names (keyspace.table).
            // NOTE(review): ';' does not reset ctx to Start here — a second
            // statement on the same line inherits the previous ctx. Appears
            // harmless for keyword classification, but confirm intent.
            if ch == b'.' {
                ctx = GrammarContext::ExpectQualifiedPart;
            }

            tokens.push(Token {
                kind: TokenKind::Punctuation,
                text,
                start,
                end: i,
            });
            continue;
        }

        // Unknown character (advance by UTF-8 char length)
        let start = i;
        let clen = char_len_at(bytes, i);
        i += clen;
        tokens.push(Token {
            kind: TokenKind::Unknown,
            text: input[start..i].to_string(),
            start,
            end: i,
        });
    }

    tokens
}
583
/// Get the grammar context at the end of the given input.
/// Useful for tab completion to know what kind of token is expected next.
///
/// Convenience wrapper: performs a full O(n) lex of `input` and then derives
/// the trailing context with [`context_from_tokens`]. If you already hold the
/// token stream, call [`context_from_tokens`] directly.
pub fn grammar_context_at_end(input: &str) -> GrammarContext {
    let tokens = tokenize(input);
    context_from_tokens(&tokens)
}
590
591/// Derive grammar context from a token sequence (skipping whitespace/comments).
592pub fn context_from_tokens(tokens: &[Token]) -> GrammarContext {
593    // Walk backwards through significant tokens to determine context
594    let significant: Vec<&Token> = tokens
595        .iter()
596        .filter(|t| {
597            !matches!(
598                t.kind,
599                TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment
600            )
601        })
602        .collect();
603
604    if significant.is_empty() {
605        return GrammarContext::Start;
606    }
607
608    let last = significant.last().unwrap();
609
610    // Check last keyword for context
611    if last.kind == TokenKind::Keyword || last.kind == TokenKind::Identifier {
612        let upper = last.text.to_uppercase();
613        match upper.as_str() {
614            "SELECT" | "DISTINCT" => return GrammarContext::ExpectColumnList,
615            "FROM" | "INTO" | "UPDATE" | "TABLE" | "TRUNCATE" => {
616                return GrammarContext::ExpectTable
617            }
618            "USE" | "KEYSPACE" | "KEYSPACES" => return GrammarContext::ExpectKeyspace,
619            "WHERE" | "IF" => return GrammarContext::ExpectColumn,
620            "AND" => {
621                // AND after WHERE/IF is column context
622                if has_keyword_before(&significant, &["WHERE", "IF"]) {
623                    return GrammarContext::ExpectColumn;
624                }
625                return GrammarContext::General;
626            }
627            "SET" => {
628                // SET after UPDATE is column assignment; SET as a type is different
629                if has_keyword_before(&significant, &["UPDATE"]) {
630                    return GrammarContext::ExpectSetClause;
631                }
632                return GrammarContext::General;
633            }
634            "ORDER" => return GrammarContext::ExpectOrderBy,
635            "BY" => {
636                if significant.len() >= 2
637                    && significant[significant.len() - 2].text.to_uppercase() == "ORDER"
638                {
639                    return GrammarContext::ExpectOrderByColumn;
640                }
641                return GrammarContext::General;
642            }
643            "VALUES" => return GrammarContext::ExpectValues,
644            "WITH" => return GrammarContext::ExpectWithOption,
645            "CONSISTENCY" => return GrammarContext::ExpectConsistencyLevel,
646            "DESCRIBE" | "DESC" => return GrammarContext::ExpectDescribeTarget,
647            "SOURCE" | "CAPTURE" => return GrammarContext::ExpectFilePath,
648            "ON" => {
649                // INDEX ... ON -> expect table
650                if has_keyword_before(&significant, &["INDEX"]) {
651                    return GrammarContext::ExpectTable;
652                }
653                return GrammarContext::General;
654            }
655            "INDEX" => {
656                // After CREATE/DROP INDEX -> expect index name (identifier)
657                return GrammarContext::General;
658            }
659            _ => {}
660        }
661    }
662
663    if last.kind == TokenKind::Punctuation && last.text == "." {
664        return GrammarContext::ExpectQualifiedPart;
665    }
666
667    // After FROM table_name — we're in general clause context
668    // Check if two tokens back is a table-expecting keyword
669    if significant.len() >= 2 {
670        let second_last = significant[significant.len() - 2];
671        let sl_upper = second_last.text.to_uppercase();
672        if matches!(
673            sl_upper.as_str(),
674            "FROM" | "INTO" | "UPDATE" | "TABLE" | "TRUNCATE"
675        ) {
676            return GrammarContext::General;
677        }
678        // After SERIAL -> if next is CONSISTENCY
679        if sl_upper == "SERIAL" && last.text.to_uppercase() == "CONSISTENCY" {
680            return GrammarContext::ExpectConsistencyLevel;
681        }
682    }
683
684    GrammarContext::General
685}
686
687/// Contexts where the next word is always an identifier (a name), regardless
688/// of whether it matches a keyword. E.g., after FROM the next word is a table name.
689fn is_strict_identifier_context(ctx: GrammarContext) -> bool {
690    matches!(
691        ctx,
692        GrammarContext::ExpectTable
693            | GrammarContext::ExpectKeyspace
694            | GrammarContext::ExpectColumn
695            | GrammarContext::ExpectQualifiedPart
696            | GrammarContext::ExpectOrderByColumn
697            | GrammarContext::ExpectSetClause
698            | GrammarContext::ExpectConsistencyLevel
699    )
700}
701
/// Keywords that remain keywords inside a SELECT column list.
/// These are clause-level keywords that terminate or modify the column list.
const COLUMN_LIST_KEYWORDS: &[&str] = &["AS", "DISTINCT", "FROM", "JSON"];

/// Case-insensitive membership test against [`COLUMN_LIST_KEYWORDS`].
///
/// Uses `eq_ignore_ascii_case` over the four-entry table instead of building
/// an uppercased `String` per call — this runs once per word inside every
/// SELECT column list, so the allocation was pure overhead.
fn is_column_list_keyword(word: &str) -> bool {
    COLUMN_LIST_KEYWORDS
        .iter()
        .any(|k| k.eq_ignore_ascii_case(word))
}
710
711/// Classify a word as keyword, boolean, or identifier based on grammar context.
712fn classify_word(word: &str, ctx: GrammarContext) -> TokenKind {
713    let upper = word.to_uppercase();
714
715    // Boolean literals
716    if upper == "TRUE" || upper == "FALSE" {
717        return TokenKind::BooleanLiteral;
718    }
719
720    // In strict identifier contexts, EVERYTHING is an identifier
721    if is_strict_identifier_context(ctx) {
722        return TokenKind::Identifier;
723    }
724
725    // In column list context, only specific keywords remain keywords
726    if ctx == GrammarContext::ExpectColumnList {
727        if is_column_list_keyword(word) {
728            return TokenKind::Keyword;
729        }
730        return TokenKind::Identifier;
731    }
732
733    // NULL is a keyword-like value
734    if upper == "NULL" {
735        return TokenKind::Keyword;
736    }
737
738    if is_cql_keyword(&upper) {
739        TokenKind::Keyword
740    } else {
741        TokenKind::Identifier
742    }
743}
744
/// Advance the grammar context after seeing a word token.
///
/// Runs inline during tokenization: given the word just consumed and the
/// context it was consumed in, produce the context for the next token. The
/// keyword dispatch mirrors [`context_from_tokens`], which re-derives the same
/// information from a finished token stream.
fn advance_context_after_word(word: &str, ctx: GrammarContext) -> GrammarContext {
    let upper = word.to_uppercase();

    match upper.as_str() {
        "SELECT" => GrammarContext::ExpectColumnList,
        // Guarded arm: DISTINCT outside a column list falls through to the
        // default `_` arm below, like any other word.
        "DISTINCT" if ctx == GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
        "FROM" => GrammarContext::ExpectTable,
        "INTO" => GrammarContext::ExpectTable,
        "UPDATE" => GrammarContext::ExpectTable,
        "TABLE" => GrammarContext::ExpectTable,
        "TRUNCATE" => GrammarContext::ExpectTable,
        "USE" => GrammarContext::ExpectKeyspace,
        "KEYSPACE" => GrammarContext::ExpectKeyspace,
        "WHERE" => GrammarContext::ExpectColumn,
        "AND" => {
            // Preserve column context if we're in a WHERE/SET clause
            match ctx {
                GrammarContext::ExpectColumn | GrammarContext::General => {
                    // After WHERE col = val AND -> next column
                    GrammarContext::ExpectColumn
                }
                _ => GrammarContext::General,
            }
        }
        "SET" => {
            // After UPDATE table SET -> column assignment. By this point the
            // table name has already moved ctx to General, which is why
            // General (not ExpectTable) is the trigger here.
            match ctx {
                GrammarContext::General => GrammarContext::ExpectSetClause,
                _ => GrammarContext::General,
            }
        }
        "ORDER" => GrammarContext::ExpectOrderBy,
        "BY" => {
            if ctx == GrammarContext::ExpectOrderBy {
                GrammarContext::ExpectOrderByColumn
            } else {
                GrammarContext::General
            }
        }
        "VALUES" => GrammarContext::ExpectValues,
        "WITH" => GrammarContext::ExpectWithOption,
        "ON" => {
            // Could be INDEX ... ON table
            GrammarContext::ExpectTable
        }
        "CONSISTENCY" => GrammarContext::ExpectConsistencyLevel,
        "DESCRIBE" | "DESC" => GrammarContext::ExpectDescribeTarget,
        "SOURCE" | "CAPTURE" => GrammarContext::ExpectFilePath,
        "INSERT" => GrammarContext::General, // INSERT INTO -> INTO will set ExpectTable
        "DELETE" => GrammarContext::General, // DELETE FROM -> FROM will set ExpectTable
        "CREATE" | "ALTER" | "DROP" => GrammarContext::General,
        "IF" => GrammarContext::ExpectColumn,
        "LIMIT" => GrammarContext::General,
        _ => {
            // After an identifier in table/keyspace/column position, go to General
            match ctx {
                GrammarContext::ExpectTable
                | GrammarContext::ExpectKeyspace
                | GrammarContext::ExpectColumn
                | GrammarContext::ExpectOrderByColumn
                | GrammarContext::ExpectQualifiedPart
                | GrammarContext::ExpectDescribeTarget => GrammarContext::General,
                GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList, // stay in column list
                GrammarContext::ExpectSetClause => GrammarContext::ExpectSetClause,
                other => other,
            }
        }
    }
}
815
816/// Advance context after a value (string literal, number, etc.).
817fn advance_context_after_value(ctx: GrammarContext) -> GrammarContext {
818    match ctx {
819        GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
820        _ => GrammarContext::General,
821    }
822}
823
824/// Advance context after a name (quoted identifier, etc.).
825fn advance_context_after_name(ctx: GrammarContext) -> GrammarContext {
826    match ctx {
827        GrammarContext::ExpectTable
828        | GrammarContext::ExpectKeyspace
829        | GrammarContext::ExpectColumn
830        | GrammarContext::ExpectQualifiedPart
831        | GrammarContext::ExpectOrderByColumn
832        | GrammarContext::ExpectDescribeTarget => GrammarContext::General,
833        GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
834        other => other,
835    }
836}
837
838/// Check if any of the given keywords appear earlier in the significant token list.
839fn has_keyword_before(significant: &[&Token], keywords: &[&str]) -> bool {
840    significant.iter().rev().skip(1).any(|t| {
841        let upper = t.text.to_uppercase();
842        keywords.contains(&upper.as_str())
843    })
844}
845
/// Determine if a '-' is in a position where it could be a negative number
/// sign: at the start of input, or when the immediately preceding token is an
/// operator, punctuation, keyword, or whitespace. Only a '-' glued directly
/// onto an identifier or literal (e.g. `a-3`) is rejected.
///
/// NOTE(review): because `Whitespace` tokens are kept in the stream and
/// accepted here, `"a -3"` lexes as identifier + number literal `-3` rather
/// than as a subtraction — confirm this matches the intended grammar.
fn is_number_sign_position(tokens: &[Token]) -> bool {
    match tokens.last() {
        None => true,
        Some(t) => matches!(
            t.kind,
            TokenKind::Operator
                | TokenKind::Punctuation
                | TokenKind::Keyword
                | TokenKind::Whitespace
        ),
    }
}
860
/// Check if the text from `start` to `num_end` followed by '-' looks like the
/// beginning of a UUID: exactly 8 hex digits.
fn looks_like_uuid(input: &str, start: usize, num_end: usize) -> bool {
    let segment = &input[start..num_end];
    if segment.len() != 8 {
        return false;
    }
    segment.bytes().all(|b| b.is_ascii_hexdigit())
}
867
/// Scan a UUID pattern: 8-4-4-4-12 hex digits with dashes.
/// Returns the end position if valid, or `start` if not.
fn scan_uuid(input: &str, start: usize) -> usize {
    const SEGMENTS: [usize; 5] = [8, 4, 4, 4, 12];
    let bytes = input.as_bytes();
    let mut pos = start;

    for (idx, &want) in SEGMENTS.iter().enumerate() {
        // Every segment after the first is preceded by a dash.
        if idx != 0 {
            match bytes.get(pos) {
                Some(b'-') => pos += 1,
                _ => return start,
            }
        }
        // Consume hex digits and demand the exact segment length.
        let mut got = 0usize;
        while bytes.get(pos).map_or(false, |b| b.is_ascii_hexdigit()) {
            pos += 1;
            got += 1;
        }
        if got != want {
            return start;
        }
    }

    // A trailing identifier character means this is a longer token, not a UUID.
    match bytes.get(pos) {
        Some(&b) if b.is_ascii_alphanumeric() || b == b'_' => start,
        _ => pos,
    }
}
899
/// True for characters that begin an operator token.
///
/// Fix: `-` and `/` were missing, so subtraction and division lexed as
/// `Unknown` even though `+` and `%` were already operators. This is safe
/// because `tokenize` checks `--` line comments, `/* ... */` block comments
/// and signed number literals *before* its operator branch, so comment
/// markers and negative numbers are never misread as operators. `*` remains
/// punctuation because of `SELECT *`.
fn is_operator_char(ch: u8) -> bool {
    matches!(ch, b'=' | b'<' | b'>' | b'!' | b'+' | b'-' | b'/' | b'%')
}
903
/// Recognize the two-character comparison operators `<=`, `>=` and `!=`.
fn is_two_char_operator(first: u8, second: u8) -> bool {
    second == b'=' && (first == b'<' || first == b'>' || first == b'!')
}
907
/// True for the single-byte punctuation characters the lexer recognizes:
/// separators, brackets, `*` (as in `SELECT *`), `?` and `:` (bind markers).
fn is_punctuation(ch: u8) -> bool {
    const PUNCT: &[u8] = b";,().*?{}[]:";
    PUNCT.contains(&ch)
}
914
/// Get the UTF-8 byte length of the char starting at position `i`.
///
/// Decodes only the lead byte; out-of-range positions (and stray continuation
/// bytes) report a length of 1 so the caller always makes progress.
fn char_len_at(bytes: &[u8], i: usize) -> usize {
    match bytes.get(i) {
        None => 1,
        Some(&b) if b < 0x80 => 1,
        Some(&b) if b < 0xE0 => 2,
        Some(&b) if b < 0xF0 => 3,
        Some(_) => 4,
    }
}
931
932/// Extract only the significant (non-whitespace, non-comment) tokens.
933pub fn significant_tokens(tokens: &[Token]) -> Vec<&Token> {
934    tokens
935        .iter()
936        .filter(|t| {
937            !matches!(
938                t.kind,
939                TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment
940            )
941        })
942        .collect()
943}
944
/// Strip comments from CQL input, replacing block comments with a space
/// and removing line comments (preserving newlines).
///
/// This is a single-pass scanner (no `Vec<Token>` intermediary). Used by the
/// parser on every `feed_line` call, so performance matters: rather than
/// copying character-by-character, it tracks the start of the current
/// verbatim run and flushes whole slices only when a comment begins.
///
/// Quoted literals (`'…'`, `"…"`, `$$…$$`) are copied verbatim, so comment
/// markers inside them are preserved.
///
/// NOTE(review): CQL also permits `//` line comments; this function only
/// strips `--` and `/* */` — confirm against the tokenizer before adding it.
pub fn strip_comments(input: &str) -> String {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut result = String::with_capacity(len);
    let mut i = 0;
    // Start of the pending run of bytes to copy verbatim into `result`.
    let mut copy_from = 0;

    while i < len {
        match bytes[i] {
            // Line comment: `--` to end of line. The newline itself is NOT
            // part of the comment; it stays in the next verbatim run.
            b'-' if i + 1 < len && bytes[i + 1] == b'-' => {
                result.push_str(&input[copy_from..i]);
                i += 2;
                while i < len && bytes[i] != b'\n' {
                    i += 1;
                }
                copy_from = i;
            }
            // Block comment: /* ... */ (nested). Replaced with a single
            // space to avoid merging the tokens on either side.
            b'/' if i + 1 < len && bytes[i + 1] == b'*' => {
                result.push_str(&input[copy_from..i]);
                let mut depth: usize = 1;
                i += 2;
                while i < len && depth > 0 {
                    if bytes[i] == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
                        depth += 1;
                        i += 2;
                    } else if bytes[i] == b'*' && i + 1 < len && bytes[i + 1] == b'/' {
                        depth -= 1;
                        i += 2;
                    } else {
                        i += 1;
                    }
                }
                result.push(' ');
                copy_from = i;
            }
            // Quoted literals stay in the pending verbatim run; `''` / `""`
            // are escapes for the quote character itself.
            b'\'' => i = skip_quoted(bytes, i + 1, b'\''),
            b'"' => i = skip_quoted(bytes, i + 1, b'"'),
            // Dollar-quoted string: $$...$$ (no escape sequences).
            b'$' if i + 1 < len && bytes[i + 1] == b'$' => {
                i += 2;
                loop {
                    if i + 1 >= len {
                        i = len; // unterminated: swallow the rest verbatim
                        break;
                    }
                    if bytes[i] == b'$' && bytes[i + 1] == b'$' {
                        i += 2;
                        break;
                    }
                    i += 1;
                }
            }
            // Any other byte stays in the pending run. UTF-8 continuation
            // bytes are >= 0x80 and can never equal the ASCII markers above,
            // so byte-wise advancing never misfires inside a multi-byte char.
            _ => i += 1,
        }
    }

    // Flush whatever verbatim run is still pending.
    result.push_str(&input[copy_from..len]);
    result
}

/// Advance past the body of a single- or double-quoted span.
///
/// `i` is the byte index just past the opening quote. Returns the index just
/// past the closing quote — treating a doubled quote (`''` / `""`) as an
/// escaped quote — or `bytes.len()` if the span is unterminated. Scanning is
/// byte-wise: the ASCII quote byte cannot occur inside a multi-byte UTF-8
/// sequence, so no char-boundary tracking is needed.
fn skip_quoted(bytes: &[u8], mut i: usize, quote: u8) -> usize {
    let len = bytes.len();
    while i < len {
        if bytes[i] != quote {
            i += 1;
            continue;
        }
        i += 1; // consume the quote
        if i < len && bytes[i] == quote {
            i += 1; // doubled quote: escaped, keep scanning
        } else {
            return i; // real closing quote (or end of input)
        }
    }
    len
}
1059
1060#[cfg(test)]
1061mod tests {
1062    use super::*;
1063
1064    // ===== Helper =====
1065
1066    #[allow(dead_code)]
1067    fn token_kinds(input: &str) -> Vec<TokenKind> {
1068        tokenize(input).into_iter().map(|t| t.kind).collect()
1069    }
1070
1071    fn significant_kinds(input: &str) -> Vec<TokenKind> {
1072        tokenize(input)
1073            .into_iter()
1074            .filter(|t| t.kind != TokenKind::Whitespace)
1075            .map(|t| t.kind)
1076            .collect()
1077    }
1078
1079    fn significant_texts(input: &str) -> Vec<String> {
1080        tokenize(input)
1081            .into_iter()
1082            .filter(|t| t.kind != TokenKind::Whitespace)
1083            .map(|t| t.text)
1084            .collect()
1085    }
1086
1087    // ===== Token Kind Tests =====
1088
1089    #[test]
1090    fn keyword_select() {
1091        let tokens = tokenize("SELECT");
1092        assert_eq!(tokens.len(), 1);
1093        assert_eq!(tokens[0].kind, TokenKind::Keyword);
1094        assert_eq!(tokens[0].text, "SELECT");
1095    }
1096
1097    #[test]
1098    fn keyword_case_insensitive() {
1099        let tokens = tokenize("select");
1100        assert_eq!(tokens[0].kind, TokenKind::Keyword);
1101        assert_eq!(tokens[0].text, "select");
1102    }
1103
1104    #[test]
1105    fn keyword_mixed_case() {
1106        let tokens = tokenize("Select");
1107        assert_eq!(tokens[0].kind, TokenKind::Keyword);
1108    }
1109
1110    #[test]
1111    fn identifier_plain() {
1112        // After FROM, words are identifiers even if they match keywords
1113        let tokens = tokenize("FROM users");
1114        let sig: Vec<_> = tokens
1115            .iter()
1116            .filter(|t| t.kind != TokenKind::Whitespace)
1117            .collect();
1118        assert_eq!(sig[0].kind, TokenKind::Keyword);
1119        assert_eq!(sig[1].kind, TokenKind::Identifier);
1120        assert_eq!(sig[1].text, "users");
1121    }
1122
1123    #[test]
1124    fn identifier_after_from_keyword_name() {
1125        // USERS after FROM should be identifier, not keyword
1126        let tokens = tokenize("SELECT * FROM USERS");
1127        let sig: Vec<_> = tokens
1128            .iter()
1129            .filter(|t| t.kind != TokenKind::Whitespace)
1130            .collect();
1131        assert_eq!(sig[3].text, "USERS");
1132        assert_eq!(sig[3].kind, TokenKind::Identifier);
1133    }
1134
1135    #[test]
1136    fn identifier_key_after_from() {
1137        let tokens = tokenize("SELECT * FROM KEY");
1138        let sig: Vec<_> = tokens
1139            .iter()
1140            .filter(|t| t.kind != TokenKind::Whitespace)
1141            .collect();
1142        assert_eq!(sig[3].text, "KEY");
1143        assert_eq!(sig[3].kind, TokenKind::Identifier);
1144    }
1145
1146    #[test]
1147    fn identifier_set_after_from() {
1148        let tokens = tokenize("SELECT * FROM SET");
1149        let sig: Vec<_> = tokens
1150            .iter()
1151            .filter(|t| t.kind != TokenKind::Whitespace)
1152            .collect();
1153        assert_eq!(sig[3].text, "SET");
1154        assert_eq!(sig[3].kind, TokenKind::Identifier);
1155    }
1156
1157    #[test]
1158    fn identifier_after_into() {
1159        let tokens = tokenize("INSERT INTO my_table");
1160        let sig: Vec<_> = tokens
1161            .iter()
1162            .filter(|t| t.kind != TokenKind::Whitespace)
1163            .collect();
1164        assert_eq!(sig[2].kind, TokenKind::Identifier);
1165    }
1166
1167    #[test]
1168    fn identifier_after_update() {
1169        let tokens = tokenize("UPDATE my_table SET");
1170        let sig: Vec<_> = tokens
1171            .iter()
1172            .filter(|t| t.kind != TokenKind::Whitespace)
1173            .collect();
1174        assert_eq!(sig[1].kind, TokenKind::Identifier);
1175        assert_eq!(sig[1].text, "my_table");
1176    }
1177
    #[test]
    fn identifier_after_dot() {
        let tokens = tokenize("ks.my_table");
        let sig: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind != TokenKind::Whitespace)
            .collect();
        assert_eq!(sig[0].kind, TokenKind::Identifier); // "ks" is not a CQL keyword, so it lexes as an identifier
        assert_eq!(sig[1].kind, TokenKind::Punctuation); // .
        assert_eq!(sig[2].kind, TokenKind::Identifier); // my_table
    }
1190
1191    #[test]
1192    fn keyword_after_dot_is_identifier() {
1193        // SELECT after a dot should be identifier (qualified name)
1194        let tokens = tokenize("FROM ks.SELECT");
1195        let sig: Vec<_> = tokens
1196            .iter()
1197            .filter(|t| t.kind != TokenKind::Whitespace)
1198            .collect();
1199        assert_eq!(sig[2].kind, TokenKind::Punctuation); // .
1200        assert_eq!(sig[3].kind, TokenKind::Identifier); // SELECT as identifier
1201    }
1202
1203    #[test]
1204    fn quoted_identifier() {
1205        let tokens = tokenize("\"MyTable\"");
1206        assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1207        assert_eq!(tokens[0].text, "\"MyTable\"");
1208    }
1209
1210    #[test]
1211    fn quoted_identifier_with_escape() {
1212        let tokens = tokenize("\"My\"\"Table\"");
1213        assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1214        assert_eq!(tokens[0].text, "\"My\"\"Table\"");
1215    }
1216
1217    #[test]
1218    fn string_literal_simple() {
1219        let tokens = tokenize("'hello'");
1220        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1221        assert_eq!(tokens[0].text, "'hello'");
1222    }
1223
1224    #[test]
1225    fn string_literal_escaped_quote() {
1226        let tokens = tokenize("'it''s'");
1227        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1228        assert_eq!(tokens[0].text, "'it''s'");
1229    }
1230
1231    #[test]
1232    fn string_literal_with_semicolon() {
1233        let tokens = tokenize("'hello;world'");
1234        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1235        assert_eq!(tokens[0].text, "'hello;world'");
1236    }
1237
1238    #[test]
1239    fn string_literal_empty() {
1240        let tokens = tokenize("''");
1241        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1242        assert_eq!(tokens[0].text, "''");
1243    }
1244
1245    #[test]
1246    fn dollar_string_literal() {
1247        let tokens = tokenize("$$hello world$$");
1248        assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1249        assert_eq!(tokens[0].text, "$$hello world$$");
1250    }
1251
1252    #[test]
1253    fn dollar_string_with_semicolon() {
1254        let tokens = tokenize("$$a;b$$");
1255        assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1256    }
1257
1258    #[test]
1259    fn dollar_string_empty() {
1260        let tokens = tokenize("$$$$");
1261        assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1262        assert_eq!(tokens[0].text, "$$$$");
1263    }
1264
1265    #[test]
1266    fn number_integer() {
1267        let tokens = tokenize("42");
1268        assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1269        assert_eq!(tokens[0].text, "42");
1270    }
1271
1272    #[test]
1273    fn number_decimal() {
1274        let tokens = tokenize("3.14");
1275        assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1276        assert_eq!(tokens[0].text, "3.14");
1277    }
1278
1279    #[test]
1280    fn number_negative() {
1281        let tokens = tokenize("= -1");
1282        let sig: Vec<_> = tokens
1283            .iter()
1284            .filter(|t| t.kind != TokenKind::Whitespace)
1285            .collect();
1286        assert_eq!(sig[1].kind, TokenKind::NumberLiteral);
1287        assert_eq!(sig[1].text, "-1");
1288    }
1289
1290    #[test]
1291    fn number_exponent() {
1292        let tokens = tokenize("1.5E10");
1293        assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1294        assert_eq!(tokens[0].text, "1.5E10");
1295    }
1296
1297    #[test]
1298    fn number_not_part_of_identifier() {
1299        let tokens = tokenize("LIMIT 100");
1300        let sig: Vec<_> = tokens
1301            .iter()
1302            .filter(|t| t.kind != TokenKind::Whitespace)
1303            .collect();
1304        assert_eq!(sig[1].kind, TokenKind::NumberLiteral);
1305    }
1306
1307    #[test]
1308    fn blob_literal() {
1309        let tokens = tokenize("0xDEADBEEF");
1310        assert_eq!(tokens[0].kind, TokenKind::BlobLiteral);
1311        assert_eq!(tokens[0].text, "0xDEADBEEF");
1312    }
1313
1314    #[test]
1315    fn blob_literal_lowercase() {
1316        let tokens = tokenize("0xdeadbeef");
1317        assert_eq!(tokens[0].kind, TokenKind::BlobLiteral);
1318    }
1319
1320    #[test]
1321    fn uuid_literal() {
1322        let tokens = tokenize("550e8400-e29b-41d4-a716-446655440000");
1323        assert_eq!(tokens[0].kind, TokenKind::UuidLiteral);
1324    }
1325
1326    #[test]
1327    fn boolean_true() {
1328        let tokens = tokenize("true");
1329        assert_eq!(tokens[0].kind, TokenKind::BooleanLiteral);
1330    }
1331
1332    #[test]
1333    fn boolean_false() {
1334        let tokens = tokenize("FALSE");
1335        assert_eq!(tokens[0].kind, TokenKind::BooleanLiteral);
1336    }
1337
1338    #[test]
1339    fn operator_equals() {
1340        let tokens = tokenize("=");
1341        assert_eq!(tokens[0].kind, TokenKind::Operator);
1342    }
1343
1344    #[test]
1345    fn operator_less_equal() {
1346        let tokens = tokenize("<=");
1347        assert_eq!(tokens[0].kind, TokenKind::Operator);
1348        assert_eq!(tokens[0].text, "<=");
1349    }
1350
1351    #[test]
1352    fn operator_greater_equal() {
1353        let tokens = tokenize(">=");
1354        assert_eq!(tokens[0].kind, TokenKind::Operator);
1355    }
1356
1357    #[test]
1358    fn operator_not_equal() {
1359        let tokens = tokenize("!=");
1360        assert_eq!(tokens[0].kind, TokenKind::Operator);
1361    }
1362
1363    #[test]
1364    fn punctuation_semicolon() {
1365        let tokens = tokenize(";");
1366        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1367    }
1368
1369    #[test]
1370    fn punctuation_comma() {
1371        let tokens = tokenize(",");
1372        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1373    }
1374
1375    #[test]
1376    fn punctuation_parens() {
1377        let kinds = significant_kinds("(x)");
1378        assert_eq!(
1379            kinds,
1380            vec![
1381                TokenKind::Punctuation,
1382                TokenKind::Identifier,
1383                TokenKind::Punctuation
1384            ]
1385        );
1386    }
1387
1388    #[test]
1389    fn punctuation_star() {
1390        let tokens = tokenize("*");
1391        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1392    }
1393
1394    #[test]
1395    fn punctuation_question_mark() {
1396        let tokens = tokenize("?");
1397        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1398    }
1399
1400    #[test]
1401    fn whitespace_space() {
1402        let tokens = tokenize("  ");
1403        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
1404    }
1405
1406    #[test]
1407    fn whitespace_tab() {
1408        let tokens = tokenize("\t");
1409        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
1410    }
1411
1412    #[test]
1413    fn whitespace_newline() {
1414        let tokens = tokenize("\n");
1415        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
1416    }
1417
1418    #[test]
1419    fn line_comment() {
1420        let tokens = tokenize("-- this is a comment");
1421        assert_eq!(tokens[0].kind, TokenKind::LineComment);
1422        assert_eq!(tokens[0].text, "-- this is a comment");
1423    }
1424
1425    #[test]
1426    fn line_comment_stops_at_newline() {
1427        let tokens = tokenize("-- comment\nSELECT");
1428        assert_eq!(tokens[0].kind, TokenKind::LineComment);
1429        assert_eq!(tokens[0].text, "-- comment");
1430        // newline + SELECT follow
1431    }
1432
1433    #[test]
1434    fn block_comment() {
1435        let tokens = tokenize("/* block */");
1436        assert_eq!(tokens[0].kind, TokenKind::BlockComment);
1437        assert_eq!(tokens[0].text, "/* block */");
1438    }
1439
1440    #[test]
1441    fn block_comment_nested() {
1442        let tokens = tokenize("/* outer /* inner */ still */");
1443        assert_eq!(tokens[0].kind, TokenKind::BlockComment);
1444        assert_eq!(tokens[0].text, "/* outer /* inner */ still */");
1445    }
1446
1447    #[test]
1448    fn block_comment_with_semicolon() {
1449        let tokens = tokenize("/* ; */");
1450        assert_eq!(tokens[0].kind, TokenKind::BlockComment);
1451    }
1452
1453    #[test]
1454    fn unknown_char() {
1455        let tokens = tokenize("@");
1456        assert_eq!(tokens[0].kind, TokenKind::Unknown);
1457    }
1458
1459    // ===== Span Tests =====
1460
1461    #[test]
1462    fn spans_are_correct() {
1463        let tokens = tokenize("SELECT *");
1464        assert_eq!(tokens[0].start, 0);
1465        assert_eq!(tokens[0].end, 6);
1466        assert_eq!(tokens[1].start, 6);
1467        assert_eq!(tokens[1].end, 7);
1468        assert_eq!(tokens[2].start, 7);
1469        assert_eq!(tokens[2].end, 8);
1470    }
1471
1472    #[test]
1473    fn spans_cover_full_input() {
1474        let input = "SELECT * FROM users WHERE id = 1;";
1475        let tokens = tokenize(input);
1476        let last = tokens.last().unwrap();
1477        assert_eq!(last.end, input.len());
1478        // Verify no gaps
1479        for window in tokens.windows(2) {
1480            assert_eq!(
1481                window[0].end, window[1].start,
1482                "gap between {:?} and {:?}",
1483                window[0], window[1]
1484            );
1485        }
1486    }
1487
1488    // ===== Full Statement Tests =====
1489
1490    #[test]
1491    fn select_star_from_users() {
1492        let kinds = significant_kinds("SELECT * FROM users");
1493        assert_eq!(
1494            kinds,
1495            vec![
1496                TokenKind::Keyword,
1497                TokenKind::Punctuation,
1498                TokenKind::Keyword,
1499                TokenKind::Identifier
1500            ]
1501        );
1502    }
1503
1504    #[test]
1505    fn select_with_where() {
1506        let kinds = significant_kinds("SELECT name FROM users WHERE id = 1");
1507        assert_eq!(
1508            kinds,
1509            vec![
1510                TokenKind::Keyword,       // SELECT
1511                TokenKind::Identifier,    // name (in column list)
1512                TokenKind::Keyword,       // FROM
1513                TokenKind::Identifier,    // users
1514                TokenKind::Keyword,       // WHERE
1515                TokenKind::Identifier,    // id
1516                TokenKind::Operator,      // =
1517                TokenKind::NumberLiteral, // 1
1518            ]
1519        );
1520    }
1521
1522    #[test]
1523    fn insert_statement() {
1524        let kinds = significant_kinds("INSERT INTO my_table (id, name) VALUES (1, 'hello')");
1525        assert_eq!(
1526            kinds,
1527            vec![
1528                TokenKind::Keyword,       // INSERT
1529                TokenKind::Keyword,       // INTO
1530                TokenKind::Identifier,    // my_table
1531                TokenKind::Punctuation,   // (
1532                TokenKind::Identifier,    // id
1533                TokenKind::Punctuation,   // ,
1534                TokenKind::Identifier,    // name
1535                TokenKind::Punctuation,   // )
1536                TokenKind::Keyword,       // VALUES
1537                TokenKind::Punctuation,   // (
1538                TokenKind::NumberLiteral, // 1
1539                TokenKind::Punctuation,   // ,
1540                TokenKind::StringLiteral, // 'hello'
1541                TokenKind::Punctuation,   // )
1542            ]
1543        );
1544    }
1545
1546    #[test]
1547    fn update_statement() {
1548        let kinds = significant_kinds("UPDATE users SET name = 'Alice' WHERE id = 1");
1549        assert_eq!(
1550            kinds,
1551            vec![
1552                TokenKind::Keyword,       // UPDATE
1553                TokenKind::Identifier,    // users
1554                TokenKind::Keyword,       // SET
1555                TokenKind::Identifier,    // name
1556                TokenKind::Operator,      // =
1557                TokenKind::StringLiteral, // 'Alice'
1558                TokenKind::Keyword,       // WHERE
1559                TokenKind::Identifier,    // id
1560                TokenKind::Operator,      // =
1561                TokenKind::NumberLiteral, // 1
1562            ]
1563        );
1564    }
1565
1566    #[test]
1567    fn create_table() {
1568        let kinds = significant_kinds("CREATE TABLE ks.my_table (id int PRIMARY KEY)");
1569        assert_eq!(
1570            kinds,
1571            vec![
1572                TokenKind::Keyword,     // CREATE
1573                TokenKind::Keyword,     // TABLE
1574                TokenKind::Identifier,  // ks
1575                TokenKind::Punctuation, // .
1576                TokenKind::Identifier,  // my_table
1577                TokenKind::Punctuation, // (
1578                TokenKind::Identifier,  // id
1579                TokenKind::Identifier,  // int (type name, in general context after column name)
1580                TokenKind::Keyword,     // PRIMARY
1581                TokenKind::Keyword,     // KEY
1582                TokenKind::Punctuation, // )
1583            ]
1584        );
1585    }
1586
1587    #[test]
1588    fn use_keyspace() {
1589        let kinds = significant_kinds("USE my_keyspace");
1590        assert_eq!(kinds, vec![TokenKind::Keyword, TokenKind::Identifier]);
1591    }
1592
1593    #[test]
1594    fn qualified_table_name() {
1595        let texts = significant_texts("SELECT * FROM ks.users");
1596        assert_eq!(texts, vec!["SELECT", "*", "FROM", "ks", ".", "users"]);
1597        let kinds = significant_kinds("SELECT * FROM ks.users");
1598        assert_eq!(kinds[3], TokenKind::Identifier); // ks
1599        assert_eq!(kinds[4], TokenKind::Punctuation); // .
1600        assert_eq!(kinds[5], TokenKind::Identifier); // users
1601    }
1602
1603    #[test]
1604    fn statement_with_string_containing_keyword() {
1605        let kinds = significant_kinds("INSERT INTO t (v) VALUES ('SELECT FROM')");
1606        // 'SELECT FROM' should be one StringLiteral, not keywords
1607        assert!(kinds.contains(&TokenKind::StringLiteral));
1608        // Only 3 keywords: INSERT, INTO, VALUES
1609        assert_eq!(
1610            kinds.iter().filter(|k| **k == TokenKind::Keyword).count(),
1611            3
1612        );
1613    }
1614
1615    #[test]
1616    fn statement_with_comment() {
1617        let tokens = tokenize("SELECT 1 -- comment");
1618        let sig: Vec<_> = tokens
1619            .iter()
1620            .filter(|t| t.kind != TokenKind::Whitespace)
1621            .collect();
1622        assert_eq!(sig.len(), 3); // SELECT, 1, comment
1623        assert_eq!(sig[2].kind, TokenKind::LineComment);
1624    }
1625
1626    #[test]
1627    fn statement_with_block_comment() {
1628        let tokens = tokenize("SELECT /* mid */ 1");
1629        let sig: Vec<_> = tokens
1630            .iter()
1631            .filter(|t| t.kind != TokenKind::Whitespace)
1632            .collect();
1633        assert_eq!(sig[0].kind, TokenKind::Keyword); // SELECT
1634        assert_eq!(sig[1].kind, TokenKind::BlockComment); // /* mid */
1635        assert_eq!(sig[2].kind, TokenKind::NumberLiteral); // 1
1636    }
1637
1638    // ===== Grammar Context Tests =====
1639
1640    #[test]
1641    fn context_at_start() {
1642        assert_eq!(grammar_context_at_end(""), GrammarContext::Start);
1643    }
1644
1645    #[test]
1646    fn context_after_select() {
1647        assert_eq!(
1648            grammar_context_at_end("SELECT "),
1649            GrammarContext::ExpectColumnList
1650        );
1651    }
1652
1653    #[test]
1654    fn context_after_from() {
1655        assert_eq!(
1656            grammar_context_at_end("SELECT * FROM "),
1657            GrammarContext::ExpectTable
1658        );
1659    }
1660
1661    #[test]
1662    fn context_after_into() {
1663        assert_eq!(
1664            grammar_context_at_end("INSERT INTO "),
1665            GrammarContext::ExpectTable
1666        );
1667    }
1668
1669    #[test]
1670    fn context_after_update() {
1671        assert_eq!(
1672            grammar_context_at_end("UPDATE "),
1673            GrammarContext::ExpectTable
1674        );
1675    }
1676
1677    #[test]
1678    fn context_after_use() {
1679        assert_eq!(
1680            grammar_context_at_end("USE "),
1681            GrammarContext::ExpectKeyspace
1682        );
1683    }
1684
1685    #[test]
1686    fn context_after_where() {
1687        assert_eq!(
1688            grammar_context_at_end("SELECT * FROM t WHERE "),
1689            GrammarContext::ExpectColumn
1690        );
1691    }
1692
1693    #[test]
1694    fn context_after_dot() {
1695        assert_eq!(
1696            grammar_context_at_end("ks."),
1697            GrammarContext::ExpectQualifiedPart
1698        );
1699    }
1700
1701    #[test]
1702    fn context_after_table_name() {
1703        assert_eq!(
1704            grammar_context_at_end("SELECT * FROM users "),
1705            GrammarContext::General
1706        );
1707    }
1708
1709    #[test]
1710    fn context_after_consistency() {
1711        assert_eq!(
1712            grammar_context_at_end("CONSISTENCY "),
1713            GrammarContext::ExpectConsistencyLevel
1714        );
1715    }
1716
1717    #[test]
1718    fn context_after_describe() {
1719        assert_eq!(
1720            grammar_context_at_end("DESCRIBE "),
1721            GrammarContext::ExpectDescribeTarget
1722        );
1723    }
1724
1725    #[test]
1726    fn context_after_source() {
1727        assert_eq!(
1728            grammar_context_at_end("SOURCE "),
1729            GrammarContext::ExpectFilePath
1730        );
1731    }
1732
1733    #[test]
1734    fn context_after_order_by() {
1735        assert_eq!(
1736            grammar_context_at_end("SELECT * FROM t ORDER BY "),
1737            GrammarContext::ExpectOrderByColumn
1738        );
1739    }
1740
1741    #[test]
1742    fn context_after_values() {
1743        assert_eq!(
1744            grammar_context_at_end("INSERT INTO t (id) VALUES "),
1745            GrammarContext::ExpectValues
1746        );
1747    }
1748
1749    #[test]
1750    fn context_after_with() {
1751        assert_eq!(
1752            grammar_context_at_end("CREATE TABLE t (id int) WITH "),
1753            GrammarContext::ExpectWithOption
1754        );
1755    }
1756
1757    // ===== Edge Cases =====
1758
1759    #[test]
1760    fn empty_input() {
1761        assert!(tokenize("").is_empty());
1762    }
1763
1764    #[test]
1765    fn only_whitespace() {
1766        let tokens = tokenize("   \t\n  ");
1767        assert_eq!(tokens.len(), 1);
1768        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
1769    }
1770
1771    #[test]
1772    fn only_comment() {
1773        let tokens = tokenize("-- just a comment");
1774        assert_eq!(tokens.len(), 1);
1775        assert_eq!(tokens[0].kind, TokenKind::LineComment);
1776    }
1777
1778    #[test]
1779    fn unterminated_string() {
1780        let tokens = tokenize("'unterminated");
1781        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1782        assert_eq!(tokens[0].text, "'unterminated");
1783    }
1784
1785    #[test]
1786    fn unterminated_quoted_identifier() {
1787        let tokens = tokenize("\"unterminated");
1788        assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1789    }
1790
1791    #[test]
1792    fn unterminated_dollar_string() {
1793        let tokens = tokenize("$$unterminated");
1794        assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1795    }
1796
1797    #[test]
1798    fn unterminated_block_comment() {
1799        let tokens = tokenize("/* unterminated");
1800        assert_eq!(tokens[0].kind, TokenKind::BlockComment);
1801    }
1802
1803    #[test]
1804    fn unicode_in_string() {
1805        let tokens = tokenize("'héllo wörld'");
1806        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1807        assert_eq!(tokens[0].text, "'héllo wörld'");
1808    }
1809
1810    #[test]
1811    fn unicode_in_quoted_identifier() {
1812        let tokens = tokenize("\"naïve\"");
1813        assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1814        assert_eq!(tokens[0].text, "\"naïve\"");
1815    }
1816
1817    #[test]
1818    fn multiple_statements() {
1819        let tokens = tokenize("SELECT 1; SELECT 2;");
1820        let semis: Vec<_> = tokens.iter().filter(|t| t.text == ";").collect();
1821        assert_eq!(semis.len(), 2);
1822    }
1823
1824    #[test]
1825    fn comment_like_in_string() {
1826        let tokens = tokenize("'-- not a comment'");
1827        assert_eq!(tokens.len(), 1);
1828        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1829    }
1830
1831    #[test]
1832    fn block_comment_like_in_string() {
1833        let tokens = tokenize("'/* not a comment */'");
1834        assert_eq!(tokens.len(), 1);
1835        assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1836    }
1837
1838    #[test]
1839    fn negative_number_after_operator() {
1840        let tokens = tokenize("id = -42");
1841        let sig: Vec<_> = tokens
1842            .iter()
1843            .filter(|t| t.kind != TokenKind::Whitespace)
1844            .collect();
1845        assert_eq!(sig[2].kind, TokenKind::NumberLiteral);
1846        assert_eq!(sig[2].text, "-42");
1847    }
1848
    #[test]
    fn minus_as_operator_after_number() {
        let tokens = tokenize("5 - 3");
        let sig: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind != TokenKind::Whitespace)
            .collect();
        assert_eq!(sig[0].kind, TokenKind::NumberLiteral);
        // After a number plus whitespace, `-` is ambiguous between a binary
        // operator and the sign of the next number. The intended choice is
        // not pinned here; this deliberately loose assertion only checks that
        // `- 3` was not fused with the leading `5` (i.e. at least three
        // significant tokens come out). NOTE(review): tighten once the
        // lexer's intended tokenization for this case is decided.
        assert!(sig.len() >= 3);
    }
1863
1864    #[test]
1865    fn blob_after_value_context() {
1866        let tokens = tokenize("INSERT INTO t (b) VALUES (0xDEAD)");
1867        let sig: Vec<_> = tokens
1868            .iter()
1869            .filter(|t| t.kind != TokenKind::Whitespace)
1870            .collect();
1871        let blob = sig.iter().find(|t| t.text == "0xDEAD").unwrap();
1872        assert_eq!(blob.kind, TokenKind::BlobLiteral);
1873    }
1874
1875    #[test]
1876    fn keyword_list_is_sorted() {
1877        for window in CQL_KEYWORDS.windows(2) {
1878            assert!(
1879                window[0] < window[1],
1880                "CQL_KEYWORDS not sorted: {:?} >= {:?}",
1881                window[0],
1882                window[1]
1883            );
1884        }
1885    }
1886
1887    // ===== strip_comments =====
1888
1889    #[test]
1890    fn strip_line_comment() {
1891        let result = strip_comments("SELECT 1 -- comment");
1892        assert_eq!(result, "SELECT 1 ");
1893    }
1894
1895    #[test]
1896    fn strip_block_comment() {
1897        let result = strip_comments("SELECT /* x */ 1");
1898        assert_eq!(result, "SELECT   1");
1899    }
1900
1901    #[test]
1902    fn strip_preserves_strings() {
1903        let result = strip_comments("SELECT '-- not a comment'");
1904        assert_eq!(result, "SELECT '-- not a comment'");
1905    }
1906
1907    #[test]
1908    fn strip_preserves_dollar_strings() {
1909        let result = strip_comments("SELECT $$-- not a comment$$");
1910        assert_eq!(result, "SELECT $$-- not a comment$$");
1911    }
1912
1913    #[test]
1914    fn strip_nested_block_comments() {
1915        let result = strip_comments("SELECT /* outer /* inner */ still */ 1");
1916        assert_eq!(result, "SELECT   1");
1917    }
1918
1919    // ===== is_cql_keyword =====
1920
1921    #[test]
1922    fn keyword_lookup_positive() {
1923        assert!(is_cql_keyword("SELECT"));
1924        assert!(is_cql_keyword("select"));
1925        assert!(is_cql_keyword("From"));
1926        assert!(is_cql_keyword("WHERE"));
1927    }
1928
1929    #[test]
1930    fn keyword_lookup_negative() {
1931        assert!(!is_cql_keyword("my_table"));
1932        assert!(!is_cql_keyword("hello"));
1933        assert!(!is_cql_keyword("xyz"));
1934    }
1935
1936    // ===== significant_tokens helper =====
1937
1938    #[test]
1939    fn significant_tokens_filters_whitespace_and_comments() {
1940        let tokens = tokenize("SELECT /* comment */ * -- line\nFROM t");
1941        let sig = significant_tokens(&tokens);
1942        let kinds: Vec<_> = sig.iter().map(|t| &t.kind).collect();
1943        assert!(!kinds.contains(&&TokenKind::Whitespace));
1944        assert!(!kinds.contains(&&TokenKind::LineComment));
1945        assert!(!kinds.contains(&&TokenKind::BlockComment));
1946    }
1947
1948    // ===== Regression: colorizer false-positive tests =====
1949
1950    #[test]
1951    fn users_not_keyword_after_from() {
1952        let tokens = tokenize("SELECT * FROM users");
1953        let sig: Vec<_> = tokens
1954            .iter()
1955            .filter(|t| t.kind != TokenKind::Whitespace)
1956            .collect();
1957        assert_eq!(sig[3].text, "users");
1958        assert_eq!(sig[3].kind, TokenKind::Identifier);
1959    }
1960
1961    #[test]
1962    fn key_not_keyword_after_from() {
1963        let tokens = tokenize("SELECT key FROM my_table WHERE key = 1");
1964        let sig: Vec<_> = tokens
1965            .iter()
1966            .filter(|t| t.kind != TokenKind::Whitespace)
1967            .collect();
1968        // "key" after SELECT is in column list -> Identifier
1969        assert_eq!(sig[1].kind, TokenKind::Identifier);
1970        // "my_table" after FROM -> Identifier
1971        assert_eq!(sig[3].kind, TokenKind::Identifier);
1972        // "key" after WHERE -> Identifier
1973        assert_eq!(sig[5].kind, TokenKind::Identifier);
1974    }
1975
1976    #[test]
1977    fn set_not_keyword_in_column_list() {
1978        // "set" as a column name in SELECT
1979        let tokens = tokenize("SELECT set FROM my_table");
1980        let sig: Vec<_> = tokens
1981            .iter()
1982            .filter(|t| t.kind != TokenKind::Whitespace)
1983            .collect();
1984        assert_eq!(sig[1].text, "set");
1985        assert_eq!(sig[1].kind, TokenKind::Identifier);
1986    }
1987
1988    #[test]
1989    fn column_names_after_where_are_identifiers() {
1990        let tokens = tokenize("SELECT * FROM t WHERE user = 'test' AND key = 1");
1991        let sig: Vec<_> = tokens
1992            .iter()
1993            .filter(|t| t.kind != TokenKind::Whitespace)
1994            .collect();
1995        // user after WHERE
1996        assert_eq!(sig[5].text, "user");
1997        assert_eq!(sig[5].kind, TokenKind::Identifier);
1998        // key after AND
1999        assert_eq!(sig[9].text, "key");
2000        assert_eq!(sig[9].kind, TokenKind::Identifier);
2001    }
2002
2003    // ===== Complex queries =====
2004
2005    #[test]
2006    fn select_with_function() {
2007        let tokens = tokenize("SELECT count(*) FROM users");
2008        let sig: Vec<_> = tokens
2009            .iter()
2010            .filter(|t| t.kind != TokenKind::Whitespace)
2011            .collect();
2012        assert_eq!(sig[0].kind, TokenKind::Keyword); // SELECT
2013        assert_eq!(sig[1].kind, TokenKind::Identifier); // count (in column list context)
2014    }
2015
2016    #[test]
2017    fn batch_statement() {
2018        let input =
2019            "BEGIN BATCH INSERT INTO t (id) VALUES (1); INSERT INTO t (id) VALUES (2); APPLY BATCH";
2020        let tokens = tokenize(input);
2021        let keywords: Vec<_> = tokens
2022            .iter()
2023            .filter(|t| t.kind == TokenKind::Keyword)
2024            .collect();
2025        assert!(keywords.iter().any(|t| t.text.to_uppercase() == "BEGIN"));
2026        assert!(keywords.iter().any(|t| t.text.to_uppercase() == "BATCH"));
2027        assert!(keywords.iter().any(|t| t.text.to_uppercase() == "APPLY"));
2028    }
2029
2030    #[test]
2031    fn delete_from() {
2032        let kinds = significant_kinds("DELETE FROM users WHERE id = 1");
2033        assert_eq!(kinds[0], TokenKind::Keyword); // DELETE
2034        assert_eq!(kinds[1], TokenKind::Keyword); // FROM
2035        assert_eq!(kinds[2], TokenKind::Identifier); // users
2036    }
2037
2038    #[test]
2039    fn describe_table() {
2040        let kinds = significant_kinds("DESCRIBE TABLE users");
2041        assert_eq!(kinds[0], TokenKind::Keyword); // DESCRIBE
2042        assert_eq!(kinds[1], TokenKind::Keyword); // TABLE
2043        assert_eq!(kinds[2], TokenKind::Identifier); // users
2044    }
2045
2046    #[test]
2047    fn truncate_table() {
2048        let kinds = significant_kinds("TRUNCATE users");
2049        assert_eq!(kinds[0], TokenKind::Keyword); // TRUNCATE
2050        assert_eq!(kinds[1], TokenKind::Identifier); // users
2051    }
2052
2053    #[test]
2054    fn select_distinct() {
2055        let kinds = significant_kinds("SELECT DISTINCT partition_key FROM t");
2056        assert_eq!(kinds[0], TokenKind::Keyword); // SELECT
2057        assert_eq!(kinds[1], TokenKind::Keyword); // DISTINCT
2058        assert_eq!(kinds[2], TokenKind::Identifier); // partition_key
2059    }
2060
2061    #[test]
2062    fn consistency_level() {
2063        let kinds = significant_kinds("CONSISTENCY QUORUM");
2064        assert_eq!(kinds[0], TokenKind::Keyword); // CONSISTENCY
2065        assert_eq!(kinds[1], TokenKind::Identifier); // QUORUM (in consistency level context)
2066    }
2067
2068    #[test]
2069    fn serial_consistency() {
2070        let ctx = grammar_context_at_end("SERIAL CONSISTENCY ");
2071        assert_eq!(ctx, GrammarContext::ExpectConsistencyLevel);
2072    }
2073
2074    #[test]
2075    fn order_by_column() {
2076        let sig: Vec<_> = tokenize("SELECT * FROM t ORDER BY created_at")
2077            .into_iter()
2078            .filter(|t| t.kind != TokenKind::Whitespace)
2079            .collect();
2080        assert_eq!(sig.last().unwrap().kind, TokenKind::Identifier); // created_at
2081    }
2082}