1#[derive(Debug, Clone, PartialEq, Eq)]
12pub struct Token {
13 pub kind: TokenKind,
15 pub text: String,
17 pub start: usize,
19 pub end: usize,
21}
22
/// Lexical category of a [`Token`].
///
/// The enum carries no data, so it is `Copy` (like [`GrammarContext`]) and
/// `Hash`, which lets callers pass kinds by value and use them as map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// A word recognised as a CQL keyword in its grammatical position.
    Keyword,
    /// An unquoted name; also used for keyword spellings that appear where a
    /// name is expected (for example directly after `FROM` or after a `.`).
    Identifier,
    /// A `"`-delimited identifier; `""` inside it is an escaped quote.
    QuotedIdentifier,
    /// A `'`-delimited string; `''` inside it is an escaped quote.
    StringLiteral,
    /// A `$$ ... $$` delimited string.
    DollarStringLiteral,
    /// An integer or decimal number, optionally signed and/or with an exponent.
    NumberLiteral,
    /// A `0x`-prefixed run of hexadecimal digits.
    BlobLiteral,
    /// A UUID written as hexadecimal groups of 8-4-4-4-12 digits.
    UuidLiteral,
    /// `TRUE` or `FALSE` in any letter case.
    BooleanLiteral,
    /// One of `=`, `<`, `>`, `!`, `+`, `%`, `<=`, `>=`, `!=`.
    Operator,
    /// One of `;` `,` `(` `)` `.` `*` `?` `{` `}` `[` `]` `:`.
    Punctuation,
    /// A run of ASCII whitespace.
    Whitespace,
    /// A line comment running to the end of the line (the newline itself is
    /// not part of the token).
    LineComment,
    /// A `/* ... */` comment.
    BlockComment,
    /// Any character that fits none of the categories above.
    Unknown,
}
58
/// What kind of input is grammatically expected at a given point of a
/// CQL statement.
///
/// The tokenizer tracks a context while scanning so it can tell names from
/// keywords, and [`context_from_tokens`] derives the context at the end of a
/// token list.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GrammarContext {
    /// Nothing significant has been seen yet.
    Start,
    /// Inside the column list of a `SELECT` (after `SELECT`, `DISTINCT` or `SELECT *`).
    ExpectColumnList,
    /// A table name is expected (e.g. after `FROM`, `INTO`, `UPDATE`, `TABLE`, `TRUNCATE`).
    ExpectTable,
    /// A keyspace name is expected (e.g. after `USE` or `KEYSPACE`).
    ExpectKeyspace,
    /// A column name is expected (e.g. after `WHERE`, `IF`, or a following `AND`).
    ExpectColumn,
    /// An assignment is expected after the `SET` of an `UPDATE`.
    ExpectSetClause,
    /// The part following a `.` in a qualified name is expected.
    ExpectQualifiedPart,
    /// NOTE(review): not produced by any code visible in this section;
    /// presumably a type name is expected — confirm against callers.
    ExpectType,
    /// `BY` is expected after `ORDER`.
    ExpectOrderBy,
    /// A column name is expected after `ORDER BY`.
    ExpectOrderByColumn,
    /// A value list is expected after `VALUES`.
    ExpectValues,
    /// An option is expected after `WITH`.
    ExpectWithOption,
    /// A consistency level is expected after `CONSISTENCY`.
    ExpectConsistencyLevel,
    /// The target of `DESCRIBE` / `DESC` is expected.
    ExpectDescribeTarget,
    /// A file path is expected after `SOURCE` or `CAPTURE`.
    ExpectFilePath,
    /// The object kind following `CREATE` is expected.
    ExpectCreateTarget,
    /// The object kind following `ALTER` is expected.
    ExpectAlterTarget,
    /// The object kind following `DROP` is expected.
    ExpectDropTarget,
    /// Whatever may follow `DELETE` is expected.
    ExpectDeleteTarget,
    /// Whatever may follow `GRANT` or `REVOKE` is expected.
    ExpectGrantRevoke,
    /// Whatever may follow `INSERT` is expected.
    ExpectInsertTarget,
    /// Whatever may follow `BEGIN` is expected.
    ExpectBeginTarget,
    /// Past the table of a `SELECT ... FROM`.
    ExpectSelectPostFrom,
    /// Past the `VALUES` of an `INSERT`.
    ExpectInsertPostValues,
    /// Past the table of a `DELETE ... FROM`.
    ExpectDeletePostFrom,
    /// Directly after the table name of an `UPDATE`.
    ExpectUpdateClause,
    /// Past the `SET` of an `UPDATE`.
    ExpectUpdatePostSet,
    /// No more specific expectation applies.
    General,
}
120
/// Upper-case spellings of every word treated as a keyword.
///
/// INVARIANT: the list must stay sorted in ascending byte order, because
/// [`is_cql_keyword`] performs a binary search over it.
#[rustfmt::skip]
const CQL_KEYWORDS: &[&str] = &[
    "ADD", "AGGREGATE", "ALL", "ALLOW", "ALTER", "AND", "APPLY", "AS", "ASC", "AUTHORIZE",
    "BATCH", "BEGIN", "BY",
    "CALLED", "CAPTURE", "CLEAR", "CLS", "CLUSTERING", "COLUMN", "COMPACT", "CONSISTENCY",
    "CONTAINS", "COPY", "COUNT", "COUNTER", "CREATE", "CUSTOM",
    "DELETE", "DESC", "DESCRIBE", "DISTINCT", "DROP",
    "EACH_QUORUM", "ENTRIES", "EXECUTE", "EXISTS", "EXIT", "EXPAND",
    "FILTERING", "FINALFUNC", "FROM", "FROZEN", "FULL", "FUNCTION", "FUNCTIONS",
    "GRANT",
    "HELP",
    "IF", "IN", "INDEX", "INITCOND", "INPUT", "INSERT", "INTO", "IS",
    "JSON",
    "KEY", "KEYSPACE", "KEYSPACES",
    "LANGUAGE", "LIKE", "LIMIT", "LIST", "LOCAL_ONE", "LOCAL_QUORUM", "LOGIN",
    "MAP", "MATERIALIZED", "MODIFY",
    "NAMESPACE", "NORECURSIVE", "NOT", "NULL",
    "OF", "ON", "ONE", "OR", "ORDER",
    "PAGING", "PARTITION", "PASSWORD", "PER", "PERMISSION", "PERMISSIONS", "PRIMARY",
    "QUIT", "QUORUM",
    "RENAME", "REPLACE", "RETURNS", "REVOKE",
    "SCHEMA", "SELECT", "SERIAL", "SET", "SFUNC", "SHOW", "SOURCE", "STATIC", "STORAGE",
    "STYPE", "SUPERUSER",
    "TABLE", "TABLES", "TEXT", "THREE", "TIMESTAMP", "TO", "TOKEN", "TRACING", "TRIGGER",
    "TRUNCATE", "TTL", "TUPLE", "TWO", "TYPE",
    "UNICODE", "UNLOGGED", "UPDATE", "USE", "USER", "USERS", "USING",
    "VALUES", "VIEW",
    "WHERE", "WITH", "WRITETIME",
];

/// Returns `true` when `word` spells a keyword from [`CQL_KEYWORDS`],
/// ignoring ASCII letter case.
///
/// Only ASCII case folding is applied. Unicode case mapping would turn
/// non-ASCII input into keyword spellings (`"ß"` upper-cases to `"SS"`, so
/// `"paßword"` would match `PASSWORD`), which is never what a caller wants.
/// The comparison is done byte by byte, so no temporary string is allocated.
pub fn is_cql_keyword(word: &str) -> bool {
    CQL_KEYWORDS
        .binary_search_by(|keyword| {
            // Keywords are stored upper-case, so only the candidate is folded.
            keyword
                .bytes()
                .cmp(word.bytes().map(|b| b.to_ascii_uppercase()))
        })
        .is_ok()
}
258
259pub fn tokenize(input: &str) -> Vec<Token> {
264 let mut tokens = Vec::new();
265 let mut ctx = GrammarContext::Start;
266 let bytes = input.as_bytes();
267 let len = bytes.len();
268 let mut i = 0;
269
270 while i < len {
271 let ch = bytes[i];
272
273 if ch.is_ascii_whitespace() {
275 let start = i;
276 while i < len && bytes[i].is_ascii_whitespace() {
277 i += 1;
278 }
279 tokens.push(Token {
280 kind: TokenKind::Whitespace,
281 text: input[start..i].to_string(),
282 start,
283 end: i,
284 });
285 continue;
286 }
287
288 if ch == b'-' && i + 1 < len && bytes[i + 1] == b'-' {
290 let start = i;
291 i += 2;
292 while i < len && bytes[i] != b'\n' {
293 i += 1;
294 }
295 tokens.push(Token {
296 kind: TokenKind::LineComment,
297 text: input[start..i].to_string(),
298 start,
299 end: i,
300 });
301 continue;
302 }
303
304 if ch == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
306 let start = i;
307 let mut depth: usize = 1;
308 i += 2;
309 while i < len && depth > 0 {
310 if bytes[i] == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
311 depth += 1;
312 i += 2;
313 } else if bytes[i] == b'*' && i + 1 < len && bytes[i + 1] == b'/' {
314 depth -= 1;
315 i += 2;
316 } else {
317 i += 1;
318 }
319 }
320 tokens.push(Token {
321 kind: TokenKind::BlockComment,
322 text: input[start..i].to_string(),
323 start,
324 end: i,
325 });
326 continue;
327 }
328
329 if ch == b'\'' {
331 let start = i;
332 i += 1;
333 loop {
334 if i >= len {
335 break; }
337 if bytes[i] == b'\'' {
338 i += 1;
339 if i < len && bytes[i] == b'\'' {
341 i += 1;
342 continue;
343 }
344 break;
345 }
346 i += char_len_at(bytes, i);
348 }
349 tokens.push(Token {
350 kind: TokenKind::StringLiteral,
351 text: input[start..i].to_string(),
352 start,
353 end: i,
354 });
355 ctx = advance_context_after_value(ctx);
356 continue;
357 }
358
359 if ch == b'"' {
361 let start = i;
362 i += 1;
363 loop {
364 if i >= len {
365 break; }
367 if bytes[i] == b'"' {
368 i += 1;
369 if i < len && bytes[i] == b'"' {
371 i += 1;
372 continue;
373 }
374 break;
375 }
376 i += char_len_at(bytes, i);
377 }
378 tokens.push(Token {
379 kind: TokenKind::QuotedIdentifier,
380 text: input[start..i].to_string(),
381 start,
382 end: i,
383 });
384 ctx = advance_context_after_name(ctx);
385 continue;
386 }
387
388 if ch == b'$' && i + 1 < len && bytes[i + 1] == b'$' {
390 let start = i;
391 i += 2;
392 loop {
393 if i + 1 >= len {
394 i = len;
395 break; }
397 if bytes[i] == b'$' && bytes[i + 1] == b'$' {
398 i += 2;
399 break;
400 }
401 i += 1;
402 }
403 tokens.push(Token {
404 kind: TokenKind::DollarStringLiteral,
405 text: input[start..i].to_string(),
406 start,
407 end: i,
408 });
409 ctx = advance_context_after_value(ctx);
410 continue;
411 }
412
413 if ch == b'0' && i + 1 < len && (bytes[i + 1] == b'x' || bytes[i + 1] == b'X') {
415 let start = i;
416 i += 2;
417 while i < len && bytes[i].is_ascii_hexdigit() {
418 i += 1;
419 }
420 if i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
422 i = start;
424 } else {
425 tokens.push(Token {
426 kind: TokenKind::BlobLiteral,
427 text: input[start..i].to_string(),
428 start,
429 end: i,
430 });
431 ctx = advance_context_after_value(ctx);
432 continue;
433 }
434 }
435
436 if ch.is_ascii_digit()
439 || (ch == b'-'
440 && i + 1 < len
441 && bytes[i + 1].is_ascii_digit()
442 && is_number_sign_position(&tokens))
443 {
444 let start = i;
445 if ch == b'-' {
446 i += 1;
447 }
448 while i < len && bytes[i].is_ascii_digit() {
449 i += 1;
450 }
451 if i < len && bytes[i] == b'.' && i + 1 < len && bytes[i + 1].is_ascii_digit() {
453 i += 1;
454 while i < len && bytes[i].is_ascii_digit() {
455 i += 1;
456 }
457 }
458 if i < len && (bytes[i] == b'e' || bytes[i] == b'E') {
460 let save = i;
461 i += 1;
462 if i < len && (bytes[i] == b'+' || bytes[i] == b'-') {
463 i += 1;
464 }
465 if i < len && bytes[i].is_ascii_digit() {
466 while i < len && bytes[i].is_ascii_digit() {
467 i += 1;
468 }
469 } else {
470 i = save; }
472 }
473 if i < len && bytes[i] == b'-' && looks_like_uuid(input, start, i) {
475 let uuid_end = scan_uuid(input, start);
477 if uuid_end > i {
478 i = uuid_end;
479 tokens.push(Token {
480 kind: TokenKind::UuidLiteral,
481 text: input[start..i].to_string(),
482 start,
483 end: i,
484 });
485 ctx = advance_context_after_value(ctx);
486 continue;
487 }
488 }
489 if i < len && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_') {
491 while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
494 i += 1;
495 }
496 let word = &input[start..i];
497 tokens.push(Token {
498 kind: classify_word(word, ctx),
499 text: word.to_string(),
500 start,
501 end: i,
502 });
503 ctx = advance_context_after_word(word, ctx);
504 continue;
505 }
506 tokens.push(Token {
507 kind: TokenKind::NumberLiteral,
508 text: input[start..i].to_string(),
509 start,
510 end: i,
511 });
512 ctx = advance_context_after_value(ctx);
513 continue;
514 }
515
516 if ch.is_ascii_alphabetic() || ch == b'_' {
518 let start = i;
519 while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
520 i += 1;
521 }
522 let word = &input[start..i];
523
524 if i < len
526 && bytes[i] == b'-'
527 && word.len() == 8
528 && word.chars().all(|c| c.is_ascii_hexdigit())
529 {
530 let uuid_end = scan_uuid(input, start);
531 if uuid_end > i {
532 i = uuid_end;
533 tokens.push(Token {
534 kind: TokenKind::UuidLiteral,
535 text: input[start..i].to_string(),
536 start,
537 end: i,
538 });
539 ctx = advance_context_after_value(ctx);
540 continue;
541 }
542 }
543
544 let kind = classify_word(word, ctx);
545 tokens.push(Token {
546 kind,
547 text: word.to_string(),
548 start,
549 end: i,
550 });
551 ctx = advance_context_after_word(word, ctx);
552 continue;
553 }
554
555 if is_operator_char(ch) {
557 let start = i;
558 if i + 1 < len && is_two_char_operator(ch, bytes[i + 1]) {
560 i += 2;
561 } else {
562 i += 1;
563 }
564 tokens.push(Token {
565 kind: TokenKind::Operator,
566 text: input[start..i].to_string(),
567 start,
568 end: i,
569 });
570 continue;
571 }
572
573 if is_punctuation(ch) {
575 let start = i;
576 i += 1;
577 let text = input[start..i].to_string();
578
579 if ch == b'.' {
581 ctx = GrammarContext::ExpectQualifiedPart;
582 }
583
584 tokens.push(Token {
585 kind: TokenKind::Punctuation,
586 text,
587 start,
588 end: i,
589 });
590 continue;
591 }
592
593 let start = i;
595 let clen = char_len_at(bytes, i);
596 i += clen;
597 tokens.push(Token {
598 kind: TokenKind::Unknown,
599 text: input[start..i].to_string(),
600 start,
601 end: i,
602 });
603 }
604
605 tokens
606}
607
608pub fn grammar_context_at_end(input: &str) -> GrammarContext {
611 let tokens = tokenize(input);
612 context_from_tokens(&tokens, input.len())
613}
614
615pub fn context_from_tokens(tokens: &[Token], input_len: usize) -> GrammarContext {
618 let significant: Vec<&Token> = tokens
620 .iter()
621 .filter(|t| {
622 !matches!(
623 t.kind,
624 TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment
625 )
626 })
627 .collect();
628
629 if significant.is_empty() {
630 return GrammarContext::Start;
631 }
632
633 let last = significant.last().unwrap();
634
635 if last.kind == TokenKind::Keyword || last.kind == TokenKind::Identifier {
637 let upper = last.text.to_uppercase();
638 match upper.as_str() {
639 "SELECT" | "DISTINCT" => return GrammarContext::ExpectColumnList,
640 "FROM" | "UPDATE" | "TABLE" | "TRUNCATE" => return GrammarContext::ExpectTable,
641 "INTO" => {
642 if has_keyword_before(&significant, &["INSERT"]) {
643 return GrammarContext::ExpectTable;
644 }
645 return GrammarContext::ExpectTable;
646 }
647 "CREATE" => return GrammarContext::ExpectCreateTarget,
648 "ALTER" => return GrammarContext::ExpectAlterTarget,
649 "DROP" => return GrammarContext::ExpectDropTarget,
650 "DELETE" => return GrammarContext::ExpectDeleteTarget,
651 "GRANT" | "REVOKE" => return GrammarContext::ExpectGrantRevoke,
652 "INSERT" => return GrammarContext::ExpectInsertTarget,
653 "BEGIN" => return GrammarContext::ExpectBeginTarget,
654 "USE" | "KEYSPACE" | "KEYSPACES" => return GrammarContext::ExpectKeyspace,
655 "WHERE" | "IF" => return GrammarContext::ExpectColumn,
656 "AND" => {
657 if has_keyword_before(&significant, &["WHERE", "IF"]) {
659 return GrammarContext::ExpectColumn;
660 }
661 return GrammarContext::General;
662 }
663 "SET" => {
664 if has_keyword_before(&significant, &["UPDATE"]) {
666 return GrammarContext::ExpectSetClause;
667 }
668 return GrammarContext::General;
669 }
670 "ORDER" => return GrammarContext::ExpectOrderBy,
671 "BY" => {
672 if significant.len() >= 2
673 && significant[significant.len() - 2].text.to_uppercase() == "ORDER"
674 {
675 return GrammarContext::ExpectOrderByColumn;
676 }
677 return GrammarContext::General;
678 }
679 "VALUES" => return GrammarContext::ExpectValues,
680 "WITH" => return GrammarContext::ExpectWithOption,
681 "CONSISTENCY" => return GrammarContext::ExpectConsistencyLevel,
682 "DESCRIBE" | "DESC" => return GrammarContext::ExpectDescribeTarget,
683 "SOURCE" | "CAPTURE" => return GrammarContext::ExpectFilePath,
684 "ON" => {
685 if has_keyword_before(&significant, &["INDEX"]) {
687 return GrammarContext::ExpectTable;
688 }
689 return GrammarContext::General;
690 }
691 "INDEX" => {
692 return GrammarContext::General;
694 }
695 _ => {}
696 }
697 }
698
699 if last.kind == TokenKind::Punctuation && last.text == "." {
700 return GrammarContext::ExpectQualifiedPart;
701 }
702
703 if last.kind == TokenKind::Punctuation
705 && last.text == "*"
706 && significant.len() >= 2
707 && significant[significant.len() - 2].text.to_uppercase() == "SELECT"
708 {
709 return GrammarContext::ExpectColumnList;
710 }
711
712 if significant.len() >= 2 {
716 let last_token_end = last.start + last.text.len();
717 let at_end_of_input = last_token_end >= input_len;
718 let second_last = significant[significant.len() - 2];
719 if at_end_of_input && second_last.kind == TokenKind::Punctuation && second_last.text == "."
720 {
721 return GrammarContext::ExpectQualifiedPart;
722 }
723 }
724
725 if significant.len() >= 2 {
728 let keyword_before_table = find_keyword_before_table(&significant);
729 if let Some(kw_upper) = keyword_before_table {
730 match kw_upper.as_str() {
731 "FROM" => {
732 if has_keyword_before(&significant, &["SELECT"]) {
733 return GrammarContext::ExpectSelectPostFrom;
734 }
735 if has_keyword_before(&significant, &["DELETE"]) {
736 return GrammarContext::ExpectDeletePostFrom;
737 }
738 return GrammarContext::General;
739 }
740 "INTO" => {
741 return GrammarContext::General;
742 }
743 "UPDATE" => {
744 return GrammarContext::ExpectUpdateClause;
745 }
746 "TABLE" | "TRUNCATE" => {
747 return GrammarContext::General;
748 }
749 _ => {}
750 }
751 }
752 let second_last = significant[significant.len() - 2];
754 let sl_upper = second_last.text.to_uppercase();
755 if sl_upper == "SERIAL" && last.text.to_uppercase() == "CONSISTENCY" {
756 return GrammarContext::ExpectConsistencyLevel;
757 }
758 }
759
760 if has_keyword_before(&significant, &["UPDATE"]) && has_keyword_before(&significant, &["SET"]) {
762 return GrammarContext::ExpectUpdatePostSet;
763 }
764 if has_keyword_before(&significant, &["INSERT"])
765 && has_keyword_before(&significant, &["VALUES"])
766 {
767 return GrammarContext::ExpectInsertPostValues;
768 }
769 if has_keyword_before(&significant, &["SELECT"]) && has_keyword_before(&significant, &["FROM"])
770 {
771 return GrammarContext::ExpectSelectPostFrom;
772 }
773 if has_keyword_before(&significant, &["DELETE"]) && has_keyword_before(&significant, &["FROM"])
774 {
775 return GrammarContext::ExpectDeletePostFrom;
776 }
777
778 GrammarContext::General
779}
780
781fn is_strict_identifier_context(ctx: GrammarContext) -> bool {
784 matches!(
785 ctx,
786 GrammarContext::ExpectTable
787 | GrammarContext::ExpectKeyspace
788 | GrammarContext::ExpectColumn
789 | GrammarContext::ExpectQualifiedPart
790 | GrammarContext::ExpectOrderByColumn
791 | GrammarContext::ExpectSetClause
792 | GrammarContext::ExpectConsistencyLevel
793 )
794}
795
/// The only words that keep their keyword meaning inside a `SELECT` column
/// list; every other word there is treated as a column name.
const COLUMN_LIST_KEYWORDS: &[&str] = &["AS", "DISTINCT", "FROM", "JSON"];

/// Returns `true` when `word` is one of [`COLUMN_LIST_KEYWORDS`], ignoring
/// ASCII letter case.
///
/// ASCII-only folding is deliberate: Unicode upper-casing would make
/// look-alikes such as `"aſ"` (long s) compare equal to `AS`. Comparing in
/// place also avoids allocating an upper-cased copy of `word`.
fn is_column_list_keyword(word: &str) -> bool {
    COLUMN_LIST_KEYWORDS
        .iter()
        .any(|keyword| keyword.eq_ignore_ascii_case(word))
}
804
805fn classify_word(word: &str, ctx: GrammarContext) -> TokenKind {
807 let upper = word.to_uppercase();
808
809 if upper == "TRUE" || upper == "FALSE" {
811 return TokenKind::BooleanLiteral;
812 }
813
814 if is_strict_identifier_context(ctx) {
816 return TokenKind::Identifier;
817 }
818
819 if ctx == GrammarContext::ExpectColumnList {
821 if is_column_list_keyword(word) {
822 return TokenKind::Keyword;
823 }
824 return TokenKind::Identifier;
825 }
826
827 if upper == "NULL" {
829 return TokenKind::Keyword;
830 }
831
832 if is_cql_keyword(&upper) {
833 TokenKind::Keyword
834 } else {
835 TokenKind::Identifier
836 }
837}
838
839fn advance_context_after_word(word: &str, ctx: GrammarContext) -> GrammarContext {
841 let upper = word.to_uppercase();
842
843 match upper.as_str() {
844 "SELECT" => GrammarContext::ExpectColumnList,
845 "DISTINCT" if ctx == GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
846 "FROM" => GrammarContext::ExpectTable,
847 "INTO" => GrammarContext::ExpectTable,
848 "UPDATE" => GrammarContext::ExpectTable,
849 "TABLE" => GrammarContext::ExpectTable,
850 "TRUNCATE" => GrammarContext::ExpectTable,
851 "USE" => GrammarContext::ExpectKeyspace,
852 "KEYSPACE" => GrammarContext::ExpectKeyspace,
853 "WHERE" => GrammarContext::ExpectColumn,
854 "AND" => {
855 match ctx {
857 GrammarContext::ExpectColumn | GrammarContext::General => {
858 GrammarContext::ExpectColumn
860 }
861 _ => GrammarContext::General,
862 }
863 }
864 "SET" => {
865 match ctx {
867 GrammarContext::General => GrammarContext::ExpectSetClause,
868 _ => GrammarContext::General,
869 }
870 }
871 "ORDER" => GrammarContext::ExpectOrderBy,
872 "BY" => {
873 if ctx == GrammarContext::ExpectOrderBy {
874 GrammarContext::ExpectOrderByColumn
875 } else {
876 GrammarContext::General
877 }
878 }
879 "VALUES" => GrammarContext::ExpectValues,
880 "WITH" => GrammarContext::ExpectWithOption,
881 "ON" => {
882 GrammarContext::ExpectTable
884 }
885 "CONSISTENCY" => GrammarContext::ExpectConsistencyLevel,
886 "DESCRIBE" | "DESC" => GrammarContext::ExpectDescribeTarget,
887 "SOURCE" | "CAPTURE" => GrammarContext::ExpectFilePath,
888 "INSERT" => GrammarContext::General, "DELETE" => GrammarContext::General, "CREATE" | "ALTER" | "DROP" => GrammarContext::General,
891 "IF" => GrammarContext::ExpectColumn,
892 "LIMIT" => GrammarContext::General,
893 _ => {
894 match ctx {
896 GrammarContext::ExpectTable
897 | GrammarContext::ExpectKeyspace
898 | GrammarContext::ExpectColumn
899 | GrammarContext::ExpectOrderByColumn
900 | GrammarContext::ExpectQualifiedPart
901 | GrammarContext::ExpectDescribeTarget => GrammarContext::General,
902 GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList, GrammarContext::ExpectSetClause => GrammarContext::ExpectSetClause,
904 other => other,
905 }
906 }
907 }
908}
909
910fn advance_context_after_value(ctx: GrammarContext) -> GrammarContext {
912 match ctx {
913 GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
914 _ => GrammarContext::General,
915 }
916}
917
918fn advance_context_after_name(ctx: GrammarContext) -> GrammarContext {
920 match ctx {
921 GrammarContext::ExpectTable
922 | GrammarContext::ExpectKeyspace
923 | GrammarContext::ExpectColumn
924 | GrammarContext::ExpectQualifiedPart
925 | GrammarContext::ExpectOrderByColumn
926 | GrammarContext::ExpectDescribeTarget => GrammarContext::General,
927 GrammarContext::ExpectColumnList => GrammarContext::ExpectColumnList,
928 other => other,
929 }
930}
931
932fn find_keyword_before_table(significant: &[&Token]) -> Option<String> {
936 const TABLE_KEYWORDS: &[&str] = &["FROM", "INTO", "UPDATE", "TABLE", "TRUNCATE"];
937 let len = significant.len();
938 if len < 2 {
939 return None;
940 }
941
942 let second_last_upper = significant[len - 2].text.to_uppercase();
944 if TABLE_KEYWORDS.contains(&second_last_upper.as_str()) {
945 return Some(second_last_upper);
946 }
947
948 if len >= 4 && significant[len - 2].text == "." {
950 let kw_upper = significant[len - 4].text.to_uppercase();
951 if TABLE_KEYWORDS.contains(&kw_upper.as_str()) {
952 return Some(kw_upper);
953 }
954 }
955
956 None
957}
958
959fn has_keyword_before(significant: &[&Token], keywords: &[&str]) -> bool {
960 significant.iter().rev().skip(1).any(|t| {
961 let upper = t.text.to_uppercase();
962 keywords.contains(&upper.as_str())
963 })
964}
965
966fn is_number_sign_position(tokens: &[Token]) -> bool {
969 match tokens.last() {
970 None => true,
971 Some(t) => matches!(
972 t.kind,
973 TokenKind::Operator
974 | TokenKind::Punctuation
975 | TokenKind::Keyword
976 | TokenKind::Whitespace
977 ),
978 }
979}
980
/// Cheap pre-check for a UUID: is `input[start..num_end]` exactly eight
/// hexadecimal digits, i.e. a plausible first UUID group?
fn looks_like_uuid(input: &str, start: usize, num_end: usize) -> bool {
    let first_group = &input[start..num_end];
    if first_group.len() != 8 {
        return false;
    }
    first_group.bytes().all(|b| b.is_ascii_hexdigit())
}
987
/// Tries to read a UUID (hex groups of 8-4-4-4-12 digits joined by `-`)
/// beginning at byte offset `start`.
///
/// Returns the offset just past the UUID on success. Returns `start`
/// unchanged when the text is not a UUID, including when the final group is
/// directly followed by a letter, digit or `_`.
fn scan_uuid(input: &str, start: usize) -> usize {
    const GROUP_WIDTHS: [usize; 5] = [8, 4, 4, 4, 12];
    let bytes = input.as_bytes();
    let mut cursor = start;

    for (index, &width) in GROUP_WIDTHS.iter().enumerate() {
        // Every group but the first is introduced by a dash.
        if index > 0 {
            if bytes.get(cursor) != Some(&b'-') {
                return start;
            }
            cursor += 1;
        }

        // Count the whole hex run so over-long groups are rejected too.
        let run = bytes.get(cursor..).map_or(0, |rest| {
            rest.iter().take_while(|b| b.is_ascii_hexdigit()).count()
        });
        if run != width {
            return start;
        }
        cursor += run;
    }

    match bytes.get(cursor) {
        Some(&b) if b.is_ascii_alphanumeric() || b == b'_' => start,
        _ => cursor,
    }
}
1019
/// Returns `true` when `ch` can start an operator token.
fn is_operator_char(ch: u8) -> bool {
    b"=<>!+%".contains(&ch)
}
1023
/// Returns `true` when `first` followed by `second` forms one of the
/// two-character operators `<=`, `>=` or `!=`.
fn is_two_char_operator(first: u8, second: u8) -> bool {
    second == b'=' && matches!(first, b'<' | b'>' | b'!')
}
1027
/// Returns `true` when `ch` is a single-character punctuation token.
fn is_punctuation(ch: u8) -> bool {
    b";,().*?{}[]:".contains(&ch)
}
1034
/// Returns how many bytes the character starting at `bytes[i]` occupies,
/// judged from that byte alone.
///
/// Returns 1 when `i` is out of range, so a caller advancing by the result
/// always makes progress.
fn char_len_at(bytes: &[u8], i: usize) -> usize {
    match bytes.get(i).copied() {
        None => 1,
        Some(0x00..=0x7F) => 1,
        Some(0x80..=0xDF) => 2,
        Some(0xE0..=0xEF) => 3,
        Some(_) => 4,
    }
}
1051
1052pub fn significant_tokens(tokens: &[Token]) -> Vec<&Token> {
1054 tokens
1055 .iter()
1056 .filter(|t| {
1057 !matches!(
1058 t.kind,
1059 TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment
1060 )
1061 })
1062 .collect()
1063}
1064
/// Returns `input` with all comments removed.
///
/// * Line comments (`-- ...` and `// ...`) are dropped up to, but not
///   including, the terminating newline.
/// * Block comments (`/* ... */`, nesting is tracked) are replaced by a
///   single space so the text on either side stays separated.
/// * Comment markers inside `'...'` strings, `"..."` quoted identifiers and
///   `$$...$$` strings are left alone; those regions are copied verbatim.
///
/// Unterminated block comments and quoted regions extend to the end of the
/// input.
pub fn strip_comments(input: &str) -> String {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut out = String::with_capacity(len);
    let mut pos = 0;

    while pos < len {
        let here = bytes[pos];
        let peek = bytes.get(pos + 1).copied();

        match (here, peek) {
            // Line comment: skip to the newline, which is kept.
            (b'-', Some(b'-')) | (b'/', Some(b'/')) => {
                pos += 2;
                while pos < len && bytes[pos] != b'\n' {
                    pos += 1;
                }
            }
            // Block comment: skip to the matching closer, leave one space.
            (b'/', Some(b'*')) => {
                let mut depth: usize = 1;
                pos += 2;
                while pos < len && depth > 0 {
                    match (bytes[pos], bytes.get(pos + 1).copied()) {
                        (b'/', Some(b'*')) => {
                            depth += 1;
                            pos += 2;
                        }
                        (b'*', Some(b'/')) => {
                            depth -= 1;
                            pos += 2;
                        }
                        _ => pos += 1,
                    }
                }
                out.push(' ');
            }
            // Quoted string / identifier: copy through the closing quote.
            // Stepping bytewise is safe because a quote byte never occurs
            // inside a multi-byte UTF-8 sequence.
            (b'\'', _) | (b'"', _) => {
                let begin = pos;
                pos += 1;
                while pos < len {
                    if bytes[pos] != here {
                        pos += 1;
                        continue;
                    }
                    pos += 1;
                    if pos < len && bytes[pos] == here {
                        // Doubled quote: an escape, keep scanning.
                        pos += 1;
                    } else {
                        break;
                    }
                }
                out.push_str(&input[begin..pos]);
            }
            // Dollar-quoted string: copy through the next `$$`.
            (b'$', Some(b'$')) => {
                let begin = pos;
                let body = pos + 2;
                pos = match input[body..].find("$$") {
                    Some(offset) => body + offset + 2,
                    None => len,
                };
                out.push_str(&input[begin..pos]);
            }
            // Ordinary text: copy one whole character.
            _ => {
                let width = input[pos..].chars().next().map_or(1, char::len_utf8);
                out.push_str(&input[pos..pos + width]);
                pos += width;
            }
        }
    }

    out
}
1188
1189#[cfg(test)]
1190mod tests {
1191 use super::*;
1192
1193 #[allow(dead_code)]
1196 fn token_kinds(input: &str) -> Vec<TokenKind> {
1197 tokenize(input).into_iter().map(|t| t.kind).collect()
1198 }
1199
1200 fn significant_kinds(input: &str) -> Vec<TokenKind> {
1201 tokenize(input)
1202 .into_iter()
1203 .filter(|t| t.kind != TokenKind::Whitespace)
1204 .map(|t| t.kind)
1205 .collect()
1206 }
1207
1208 fn significant_texts(input: &str) -> Vec<String> {
1209 tokenize(input)
1210 .into_iter()
1211 .filter(|t| t.kind != TokenKind::Whitespace)
1212 .map(|t| t.text)
1213 .collect()
1214 }
1215
1216 #[test]
1219 fn keyword_select() {
1220 let tokens = tokenize("SELECT");
1221 assert_eq!(tokens.len(), 1);
1222 assert_eq!(tokens[0].kind, TokenKind::Keyword);
1223 assert_eq!(tokens[0].text, "SELECT");
1224 }
1225
1226 #[test]
1227 fn keyword_case_insensitive() {
1228 let tokens = tokenize("select");
1229 assert_eq!(tokens[0].kind, TokenKind::Keyword);
1230 assert_eq!(tokens[0].text, "select");
1231 }
1232
1233 #[test]
1234 fn keyword_mixed_case() {
1235 let tokens = tokenize("Select");
1236 assert_eq!(tokens[0].kind, TokenKind::Keyword);
1237 }
1238
1239 #[test]
1240 fn identifier_plain() {
1241 let tokens = tokenize("FROM users");
1243 let sig: Vec<_> = tokens
1244 .iter()
1245 .filter(|t| t.kind != TokenKind::Whitespace)
1246 .collect();
1247 assert_eq!(sig[0].kind, TokenKind::Keyword);
1248 assert_eq!(sig[1].kind, TokenKind::Identifier);
1249 assert_eq!(sig[1].text, "users");
1250 }
1251
1252 #[test]
1253 fn identifier_after_from_keyword_name() {
1254 let tokens = tokenize("SELECT * FROM USERS");
1256 let sig: Vec<_> = tokens
1257 .iter()
1258 .filter(|t| t.kind != TokenKind::Whitespace)
1259 .collect();
1260 assert_eq!(sig[3].text, "USERS");
1261 assert_eq!(sig[3].kind, TokenKind::Identifier);
1262 }
1263
1264 #[test]
1265 fn identifier_key_after_from() {
1266 let tokens = tokenize("SELECT * FROM KEY");
1267 let sig: Vec<_> = tokens
1268 .iter()
1269 .filter(|t| t.kind != TokenKind::Whitespace)
1270 .collect();
1271 assert_eq!(sig[3].text, "KEY");
1272 assert_eq!(sig[3].kind, TokenKind::Identifier);
1273 }
1274
1275 #[test]
1276 fn identifier_set_after_from() {
1277 let tokens = tokenize("SELECT * FROM SET");
1278 let sig: Vec<_> = tokens
1279 .iter()
1280 .filter(|t| t.kind != TokenKind::Whitespace)
1281 .collect();
1282 assert_eq!(sig[3].text, "SET");
1283 assert_eq!(sig[3].kind, TokenKind::Identifier);
1284 }
1285
1286 #[test]
1287 fn identifier_after_into() {
1288 let tokens = tokenize("INSERT INTO my_table");
1289 let sig: Vec<_> = tokens
1290 .iter()
1291 .filter(|t| t.kind != TokenKind::Whitespace)
1292 .collect();
1293 assert_eq!(sig[2].kind, TokenKind::Identifier);
1294 }
1295
1296 #[test]
1297 fn identifier_after_update() {
1298 let tokens = tokenize("UPDATE my_table SET");
1299 let sig: Vec<_> = tokens
1300 .iter()
1301 .filter(|t| t.kind != TokenKind::Whitespace)
1302 .collect();
1303 assert_eq!(sig[1].kind, TokenKind::Identifier);
1304 assert_eq!(sig[1].text, "my_table");
1305 }
1306
1307 #[test]
1308 fn identifier_after_dot() {
1309 let tokens = tokenize("ks.my_table");
1310 let sig: Vec<_> = tokens
1311 .iter()
1312 .filter(|t| t.kind != TokenKind::Whitespace)
1313 .collect();
1314 assert_eq!(sig[0].kind, TokenKind::Identifier); assert_eq!(sig[1].kind, TokenKind::Punctuation); assert_eq!(sig[2].kind, TokenKind::Identifier); }
1319
1320 #[test]
1321 fn keyword_after_dot_is_identifier() {
1322 let tokens = tokenize("FROM ks.SELECT");
1324 let sig: Vec<_> = tokens
1325 .iter()
1326 .filter(|t| t.kind != TokenKind::Whitespace)
1327 .collect();
1328 assert_eq!(sig[2].kind, TokenKind::Punctuation); assert_eq!(sig[3].kind, TokenKind::Identifier); }
1331
1332 #[test]
1333 fn quoted_identifier() {
1334 let tokens = tokenize("\"MyTable\"");
1335 assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1336 assert_eq!(tokens[0].text, "\"MyTable\"");
1337 }
1338
1339 #[test]
1340 fn quoted_identifier_with_escape() {
1341 let tokens = tokenize("\"My\"\"Table\"");
1342 assert_eq!(tokens[0].kind, TokenKind::QuotedIdentifier);
1343 assert_eq!(tokens[0].text, "\"My\"\"Table\"");
1344 }
1345
1346 #[test]
1347 fn string_literal_simple() {
1348 let tokens = tokenize("'hello'");
1349 assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1350 assert_eq!(tokens[0].text, "'hello'");
1351 }
1352
1353 #[test]
1354 fn string_literal_escaped_quote() {
1355 let tokens = tokenize("'it''s'");
1356 assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1357 assert_eq!(tokens[0].text, "'it''s'");
1358 }
1359
1360 #[test]
1361 fn string_literal_with_semicolon() {
1362 let tokens = tokenize("'hello;world'");
1363 assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1364 assert_eq!(tokens[0].text, "'hello;world'");
1365 }
1366
1367 #[test]
1368 fn string_literal_empty() {
1369 let tokens = tokenize("''");
1370 assert_eq!(tokens[0].kind, TokenKind::StringLiteral);
1371 assert_eq!(tokens[0].text, "''");
1372 }
1373
1374 #[test]
1375 fn dollar_string_literal() {
1376 let tokens = tokenize("$$hello world$$");
1377 assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1378 assert_eq!(tokens[0].text, "$$hello world$$");
1379 }
1380
1381 #[test]
1382 fn dollar_string_with_semicolon() {
1383 let tokens = tokenize("$$a;b$$");
1384 assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1385 }
1386
1387 #[test]
1388 fn dollar_string_empty() {
1389 let tokens = tokenize("$$$$");
1390 assert_eq!(tokens[0].kind, TokenKind::DollarStringLiteral);
1391 assert_eq!(tokens[0].text, "$$$$");
1392 }
1393
1394 #[test]
1395 fn number_integer() {
1396 let tokens = tokenize("42");
1397 assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1398 assert_eq!(tokens[0].text, "42");
1399 }
1400
1401 #[test]
1402 fn number_decimal() {
1403 let tokens = tokenize("3.14");
1404 assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1405 assert_eq!(tokens[0].text, "3.14");
1406 }
1407
1408 #[test]
1409 fn number_negative() {
1410 let tokens = tokenize("= -1");
1411 let sig: Vec<_> = tokens
1412 .iter()
1413 .filter(|t| t.kind != TokenKind::Whitespace)
1414 .collect();
1415 assert_eq!(sig[1].kind, TokenKind::NumberLiteral);
1416 assert_eq!(sig[1].text, "-1");
1417 }
1418
1419 #[test]
1420 fn number_exponent() {
1421 let tokens = tokenize("1.5E10");
1422 assert_eq!(tokens[0].kind, TokenKind::NumberLiteral);
1423 assert_eq!(tokens[0].text, "1.5E10");
1424 }
1425
1426 #[test]
1427 fn number_not_part_of_identifier() {
1428 let tokens = tokenize("LIMIT 100");
1429 let sig: Vec<_> = tokens
1430 .iter()
1431 .filter(|t| t.kind != TokenKind::Whitespace)
1432 .collect();
1433 assert_eq!(sig[1].kind, TokenKind::NumberLiteral);
1434 }
1435
1436 #[test]
1437 fn blob_literal() {
1438 let tokens = tokenize("0xDEADBEEF");
1439 assert_eq!(tokens[0].kind, TokenKind::BlobLiteral);
1440 assert_eq!(tokens[0].text, "0xDEADBEEF");
1441 }
1442
1443 #[test]
1444 fn blob_literal_lowercase() {
1445 let tokens = tokenize("0xdeadbeef");
1446 assert_eq!(tokens[0].kind, TokenKind::BlobLiteral);
1447 }
1448
1449 #[test]
1450 fn uuid_literal() {
1451 let tokens = tokenize("550e8400-e29b-41d4-a716-446655440000");
1452 assert_eq!(tokens[0].kind, TokenKind::UuidLiteral);
1453 }
1454
1455 #[test]
1456 fn boolean_true() {
1457 let tokens = tokenize("true");
1458 assert_eq!(tokens[0].kind, TokenKind::BooleanLiteral);
1459 }
1460
1461 #[test]
1462 fn boolean_false() {
1463 let tokens = tokenize("FALSE");
1464 assert_eq!(tokens[0].kind, TokenKind::BooleanLiteral);
1465 }
1466
1467 #[test]
1468 fn operator_equals() {
1469 let tokens = tokenize("=");
1470 assert_eq!(tokens[0].kind, TokenKind::Operator);
1471 }
1472
1473 #[test]
1474 fn operator_less_equal() {
1475 let tokens = tokenize("<=");
1476 assert_eq!(tokens[0].kind, TokenKind::Operator);
1477 assert_eq!(tokens[0].text, "<=");
1478 }
1479
1480 #[test]
1481 fn operator_greater_equal() {
1482 let tokens = tokenize(">=");
1483 assert_eq!(tokens[0].kind, TokenKind::Operator);
1484 }
1485
/// `!=` starts with an operator token.
#[test]
fn operator_not_equal() {
    assert_eq!(tokenize("!=")[0].kind, TokenKind::Operator);
}
1491
/// `;` is lexed as punctuation.
#[test]
fn punctuation_semicolon() {
    assert_eq!(tokenize(";")[0].kind, TokenKind::Punctuation);
}
1497
/// `,` is lexed as punctuation.
#[test]
fn punctuation_comma() {
    assert_eq!(tokenize(",")[0].kind, TokenKind::Punctuation);
}
1503
/// Parentheses around an identifier yield punctuation, identifier,
/// punctuation.
#[test]
fn punctuation_parens() {
    let expected = vec![
        TokenKind::Punctuation, // (
        TokenKind::Identifier,  // x
        TokenKind::Punctuation, // )
    ];
    assert_eq!(significant_kinds("(x)"), expected);
}
1516
/// `*` is lexed as punctuation.
#[test]
fn punctuation_star() {
    assert_eq!(tokenize("*")[0].kind, TokenKind::Punctuation);
}
1522
/// `?` is lexed as punctuation.
#[test]
fn punctuation_question_mark() {
    assert_eq!(tokenize("?")[0].kind, TokenKind::Punctuation);
}
1528
/// A single space is lexed as whitespace.
#[test]
fn whitespace_space() {
    assert_eq!(tokenize(" ")[0].kind, TokenKind::Whitespace);
}
1534
/// A tab is lexed as whitespace.
#[test]
fn whitespace_tab() {
    assert_eq!(tokenize("\t")[0].kind, TokenKind::Whitespace);
}
1540
/// A newline is lexed as whitespace.
#[test]
fn whitespace_newline() {
    assert_eq!(tokenize("\n")[0].kind, TokenKind::Whitespace);
}
1546
/// A `--` comment running to end of input is one `LineComment` token
/// containing the whole text.
#[test]
fn line_comment() {
    let source = "-- this is a comment";
    let lexed = tokenize(source);
    let first = &lexed[0];
    assert_eq!(first.kind, TokenKind::LineComment);
    assert_eq!(first.text, "-- this is a comment");
}
1553
/// The line-comment token excludes the terminating newline and anything
/// after it.
#[test]
fn line_comment_stops_at_newline() {
    let lexed = tokenize("-- comment\nSELECT");
    let comment = &lexed[0];
    assert_eq!(comment.kind, TokenKind::LineComment);
    assert_eq!(comment.text, "-- comment");
}
1561
/// A `/* … */` comment is one `BlockComment` token including both
/// delimiters.
#[test]
fn block_comment() {
    let lexed = tokenize("/* block */");
    let first = &lexed[0];
    assert_eq!(first.kind, TokenKind::BlockComment);
    assert_eq!(first.text, "/* block */");
}
1568
/// A nested block comment is consumed up to the outermost closing `*/`.
#[test]
fn block_comment_nested() {
    let lexed = tokenize("/* outer /* inner */ still */");
    let first = &lexed[0];
    assert_eq!(first.kind, TokenKind::BlockComment);
    assert_eq!(first.text, "/* outer /* inner */ still */");
}
1575
/// A semicolon inside a block comment does not end the comment token.
#[test]
fn block_comment_with_semicolon() {
    assert_eq!(tokenize("/* ; */")[0].kind, TokenKind::BlockComment);
}
1581
/// `@` is reported as an `Unknown` token.
#[test]
fn unknown_char() {
    assert_eq!(tokenize("@")[0].kind, TokenKind::Unknown);
}
1587
/// The three tokens of `SELECT *` carry the expected half-open byte spans.
#[test]
fn spans_are_correct() {
    let lexed = tokenize("SELECT *");
    // (start, end) for `SELECT`, the space, and `*`, in order.
    let expected_spans = [(0, 6), (6, 7), (7, 8)];
    for (idx, &(start, end)) in expected_spans.iter().enumerate() {
        assert_eq!(lexed[idx].start, start);
        assert_eq!(lexed[idx].end, end);
    }
}
1600
/// Token spans tile the input: the first token starts at offset 0, the
/// last ends at `input.len()`, and each token begins where the previous
/// one ended.
#[test]
fn spans_cover_full_input() {
    let input = "SELECT * FROM users WHERE id = 1;";
    let tokens = tokenize(input);

    // Without this check a tokenizer that dropped a leading span would
    // still satisfy the adjacency loop below.
    let first = tokens
        .first()
        .expect("non-empty input must produce at least one token");
    assert_eq!(first.start, 0, "first token must start at offset 0");

    let last = tokens
        .last()
        .expect("non-empty input must produce at least one token");
    assert_eq!(last.end, input.len());

    for window in tokens.windows(2) {
        assert_eq!(
            window[0].end, window[1].start,
            "gap between {:?} and {:?}",
            window[0], window[1]
        );
    }
}
1616
/// Significant token kinds of `SELECT * FROM users`.
#[test]
fn select_star_from_users() {
    let expected = vec![
        TokenKind::Keyword,     // SELECT
        TokenKind::Punctuation, // *
        TokenKind::Keyword,     // FROM
        TokenKind::Identifier,  // users
    ];
    assert_eq!(significant_kinds("SELECT * FROM users"), expected);
}
1632
/// Significant token kinds of a `SELECT` with a simple `WHERE` predicate.
#[test]
fn select_with_where() {
    let expected = vec![
        TokenKind::Keyword,       // SELECT
        TokenKind::Identifier,    // name
        TokenKind::Keyword,       // FROM
        TokenKind::Identifier,    // users
        TokenKind::Keyword,       // WHERE
        TokenKind::Identifier,    // id
        TokenKind::Operator,      // =
        TokenKind::NumberLiteral, // 1
    ];
    let actual = significant_kinds("SELECT name FROM users WHERE id = 1");
    assert_eq!(actual, expected);
}
1650
/// Significant token kinds of an `INSERT … VALUES …` statement.
#[test]
fn insert_statement() {
    let expected = vec![
        TokenKind::Keyword,       // INSERT
        TokenKind::Keyword,       // INTO
        TokenKind::Identifier,    // my_table
        TokenKind::Punctuation,   // (
        TokenKind::Identifier,    // id
        TokenKind::Punctuation,   // ,
        TokenKind::Identifier,    // name
        TokenKind::Punctuation,   // )
        TokenKind::Keyword,       // VALUES
        TokenKind::Punctuation,   // (
        TokenKind::NumberLiteral, // 1
        TokenKind::Punctuation,   // ,
        TokenKind::StringLiteral, // 'hello'
        TokenKind::Punctuation,   // )
    ];
    let actual = significant_kinds("INSERT INTO my_table (id, name) VALUES (1, 'hello')");
    assert_eq!(actual, expected);
}
1674
/// Significant token kinds of an `UPDATE … SET … WHERE …` statement.
#[test]
fn update_statement() {
    let expected = vec![
        TokenKind::Keyword,       // UPDATE
        TokenKind::Identifier,    // users
        TokenKind::Keyword,       // SET
        TokenKind::Identifier,    // name
        TokenKind::Operator,      // =
        TokenKind::StringLiteral, // 'Alice'
        TokenKind::Keyword,       // WHERE
        TokenKind::Identifier,    // id
        TokenKind::Operator,      // =
        TokenKind::NumberLiteral, // 1
    ];
    let actual = significant_kinds("UPDATE users SET name = 'Alice' WHERE id = 1");
    assert_eq!(actual, expected);
}
1694
/// Significant token kinds of a `CREATE TABLE` with a qualified name and
/// an inline primary key.
#[test]
fn create_table() {
    let expected = vec![
        TokenKind::Keyword,     // CREATE
        TokenKind::Keyword,     // TABLE
        TokenKind::Identifier,  // ks
        TokenKind::Punctuation, // .
        TokenKind::Identifier,  // my_table
        TokenKind::Punctuation, // (
        TokenKind::Identifier,  // id
        TokenKind::Identifier,  // int
        TokenKind::Keyword,     // PRIMARY
        TokenKind::Keyword,     // KEY
        TokenKind::Punctuation, // )
    ];
    let actual = significant_kinds("CREATE TABLE ks.my_table (id int PRIMARY KEY)");
    assert_eq!(actual, expected);
}
1715
/// `USE <name>` yields a keyword followed by an identifier.
#[test]
fn use_keyspace() {
    let expected = vec![TokenKind::Keyword, TokenKind::Identifier];
    assert_eq!(significant_kinds("USE my_keyspace"), expected);
}
1721
/// A `keyspace.table` reference is split into identifier, dot,
/// identifier, in both the token texts and the token kinds.
#[test]
fn qualified_table_name() {
    let input = "SELECT * FROM ks.users";

    let texts = significant_texts(input);
    assert_eq!(texts, vec!["SELECT", "*", "FROM", "ks", ".", "users"]);

    let kinds = significant_kinds(input);
    assert_eq!(kinds[3], TokenKind::Identifier); // ks
    assert_eq!(kinds[4], TokenKind::Punctuation); // .
    assert_eq!(kinds[5], TokenKind::Identifier); // users
}
1731
/// Keyword-looking words inside a string literal are not counted as
/// keywords: exactly three keyword tokens are expected in the statement.
#[test]
fn statement_with_string_containing_keyword() {
    let kinds = significant_kinds("INSERT INTO t (v) VALUES ('SELECT FROM')");
    assert!(kinds.contains(&TokenKind::StringLiteral));

    let keyword_count = kinds
        .iter()
        .filter(|kind| matches!(**kind, TokenKind::Keyword))
        .count();
    assert_eq!(keyword_count, 3);
}
1743
/// A trailing `--` comment is the third and last non-whitespace token.
#[test]
fn statement_with_comment() {
    let lexed = tokenize("SELECT 1 -- comment");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();
    assert_eq!(non_ws.len(), 3);
    assert_eq!(non_ws[2].kind, TokenKind::LineComment);
}
1754
/// A block comment in the middle of a statement is its own token between
/// the keyword and the number.
#[test]
fn statement_with_block_comment() {
    let lexed = tokenize("SELECT /* mid */ 1");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();
    assert_eq!(non_ws[0].kind, TokenKind::Keyword); // SELECT
    assert_eq!(non_ws[1].kind, TokenKind::BlockComment); // /* mid */
    assert_eq!(non_ws[2].kind, TokenKind::NumberLiteral); // 1
}
1766
/// Empty input yields the `Start` context.
#[test]
fn context_at_start() {
    let ctx = grammar_context_at_end("");
    assert_eq!(ctx, GrammarContext::Start);
}
1773
/// After `SELECT ` a column list is expected.
#[test]
fn context_after_select() {
    let ctx = grammar_context_at_end("SELECT ");
    assert_eq!(ctx, GrammarContext::ExpectColumnList);
}
1781
/// After `FROM ` a table name is expected.
#[test]
fn context_after_from() {
    let ctx = grammar_context_at_end("SELECT * FROM ");
    assert_eq!(ctx, GrammarContext::ExpectTable);
}
1789
/// After `INSERT INTO ` a table name is expected.
#[test]
fn context_after_into() {
    let ctx = grammar_context_at_end("INSERT INTO ");
    assert_eq!(ctx, GrammarContext::ExpectTable);
}
1797
/// After `UPDATE ` a table name is expected.
#[test]
fn context_after_update() {
    let ctx = grammar_context_at_end("UPDATE ");
    assert_eq!(ctx, GrammarContext::ExpectTable);
}
1805
/// After `USE ` a keyspace name is expected.
#[test]
fn context_after_use() {
    let ctx = grammar_context_at_end("USE ");
    assert_eq!(ctx, GrammarContext::ExpectKeyspace);
}
1813
/// After `WHERE ` a column name is expected.
#[test]
fn context_after_where() {
    let ctx = grammar_context_at_end("SELECT * FROM t WHERE ");
    assert_eq!(ctx, GrammarContext::ExpectColumn);
}
1821
/// A trailing `.` after an identifier expects the qualified part.
#[test]
fn context_after_dot() {
    let ctx = grammar_context_at_end("ks.");
    assert_eq!(ctx, GrammarContext::ExpectQualifiedPart);
}
1829
/// Once the table of a `SELECT … FROM` is complete, the post-`FROM`
/// clause context applies.
#[test]
fn context_after_table_name() {
    let ctx = grammar_context_at_end("SELECT * FROM users ");
    assert_eq!(ctx, GrammarContext::ExpectSelectPostFrom);
}
1837
/// After `CONSISTENCY ` a consistency level is expected.
#[test]
fn context_after_consistency() {
    let ctx = grammar_context_at_end("CONSISTENCY ");
    assert_eq!(ctx, GrammarContext::ExpectConsistencyLevel);
}
1845
/// After `DESCRIBE ` a describe target is expected.
#[test]
fn context_after_describe() {
    let ctx = grammar_context_at_end("DESCRIBE ");
    assert_eq!(ctx, GrammarContext::ExpectDescribeTarget);
}
1853
/// After `SOURCE ` a file path is expected.
#[test]
fn context_after_source() {
    let ctx = grammar_context_at_end("SOURCE ");
    assert_eq!(ctx, GrammarContext::ExpectFilePath);
}
1861
/// After `ORDER BY ` an ordering column is expected.
#[test]
fn context_after_order_by() {
    let ctx = grammar_context_at_end("SELECT * FROM t ORDER BY ");
    assert_eq!(ctx, GrammarContext::ExpectOrderByColumn);
}
1869
/// After `VALUES ` a value list is expected.
#[test]
fn context_after_values() {
    let ctx = grammar_context_at_end("INSERT INTO t (id) VALUES ");
    assert_eq!(ctx, GrammarContext::ExpectValues);
}
1877
/// After `WITH ` a table option is expected.
#[test]
fn context_after_with() {
    let ctx = grammar_context_at_end("CREATE TABLE t (id int) WITH ");
    assert_eq!(ctx, GrammarContext::ExpectWithOption);
}
1885
/// Tokenizing the empty string produces no tokens.
#[test]
fn empty_input() {
    let lexed = tokenize("");
    assert!(lexed.is_empty());
}
1892
/// A run of mixed whitespace characters collapses into one whitespace
/// token.
#[test]
fn only_whitespace() {
    let lexed = tokenize("  \t\n  ");
    assert_eq!(lexed.len(), 1);
    let only = &lexed[0];
    assert_eq!(only.kind, TokenKind::Whitespace);
}
1899
/// Input consisting solely of a line comment yields exactly one
/// `LineComment` token.
#[test]
fn only_comment() {
    let lexed = tokenize("-- just a comment");
    assert_eq!(lexed.len(), 1);
    let only = &lexed[0];
    assert_eq!(only.kind, TokenKind::LineComment);
}
1906
/// A string missing its closing quote is still a `StringLiteral` that
/// extends to the end of the input.
#[test]
fn unterminated_string() {
    let lexed = tokenize("'unterminated");
    let first = &lexed[0];
    assert_eq!(first.kind, TokenKind::StringLiteral);
    assert_eq!(first.text, "'unterminated");
}
1913
/// A quoted identifier missing its closing quote is still classified as
/// `QuotedIdentifier`.
#[test]
fn unterminated_quoted_identifier() {
    assert_eq!(
        tokenize("\"unterminated")[0].kind,
        TokenKind::QuotedIdentifier
    );
}
1919
/// A `$$` string missing its closing delimiter is still classified as
/// `DollarStringLiteral`.
#[test]
fn unterminated_dollar_string() {
    assert_eq!(
        tokenize("$$unterminated")[0].kind,
        TokenKind::DollarStringLiteral
    );
}
1925
/// A block comment missing its closing `*/` is still classified as
/// `BlockComment`.
#[test]
fn unterminated_block_comment() {
    assert_eq!(tokenize("/* unterminated")[0].kind, TokenKind::BlockComment);
}
1931
/// Multi-byte characters inside a string literal are kept intact in the
/// token text.
#[test]
fn unicode_in_string() {
    let lexed = tokenize("'héllo wörld'");
    let first = &lexed[0];
    assert_eq!(first.kind, TokenKind::StringLiteral);
    assert_eq!(first.text, "'héllo wörld'");
}
1938
/// Multi-byte characters inside a quoted identifier are kept intact in
/// the token text.
#[test]
fn unicode_in_quoted_identifier() {
    let lexed = tokenize("\"naïve\"");
    let first = &lexed[0];
    assert_eq!(first.kind, TokenKind::QuotedIdentifier);
    assert_eq!(first.text, "\"naïve\"");
}
1945
/// Two statements on one line produce two `;` tokens.
#[test]
fn multiple_statements() {
    let lexed = tokenize("SELECT 1; SELECT 2;");
    let semicolon_count = lexed.iter().filter(|tok| tok.text == ";").count();
    assert_eq!(semicolon_count, 2);
}
1952
/// `--` inside a string literal does not start a comment: the whole
/// input is one string token.
#[test]
fn comment_like_in_string() {
    let lexed = tokenize("'-- not a comment'");
    assert_eq!(lexed.len(), 1);
    let only = &lexed[0];
    assert_eq!(only.kind, TokenKind::StringLiteral);
}
1959
/// `/* … */` inside a string literal does not start a comment: the whole
/// input is one string token.
#[test]
fn block_comment_like_in_string() {
    let lexed = tokenize("'/* not a comment */'");
    assert_eq!(lexed.len(), 1);
    let only = &lexed[0];
    assert_eq!(only.kind, TokenKind::StringLiteral);
}
1966
/// A minus sign directly after an operator is folded into the number
/// literal that follows.
#[test]
fn negative_number_after_operator() {
    let lexed = tokenize("id = -42");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();
    let value = non_ws[2];
    assert_eq!(value.kind, TokenKind::NumberLiteral);
    assert_eq!(value.text, "-42");
}
1977
/// For `5 - 3` this pins only that the leading `5` is a number literal
/// and that at least three non-whitespace tokens are produced; how the
/// `-` and `3` are classified is deliberately not asserted here.
#[test]
fn minus_as_operator_after_number() {
    let lexed = tokenize("5 - 3");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();
    assert_eq!(non_ws[0].kind, TokenKind::NumberLiteral);
    assert!(non_ws.len() > 2);
}
1992
/// A blob inside a `VALUES (…)` list is classified as `BlobLiteral`.
#[test]
fn blob_after_value_context() {
    let lexed = tokenize("INSERT INTO t (b) VALUES (0xDEAD)");
    // Locate the first non-whitespace token whose text is the blob.
    let blob = lexed
        .iter()
        .find(|tok| tok.kind != TokenKind::Whitespace && tok.text == "0xDEAD")
        .unwrap();
    assert_eq!(blob.kind, TokenKind::BlobLiteral);
}
2003
/// `CQL_KEYWORDS` must be strictly ascending, so every adjacent pair is
/// ordered and no entry is duplicated.
#[test]
fn keyword_list_is_sorted() {
    let neighbours = CQL_KEYWORDS.iter().zip(CQL_KEYWORDS.iter().skip(1));
    for (prev, next) in neighbours {
        assert!(
            prev < next,
            "CQL_KEYWORDS not sorted: {:?} >= {:?}",
            prev,
            next
        );
    }
}
2015
/// A trailing `--` comment is removed; the whitespace before it stays.
#[test]
fn strip_line_comment() {
    assert_eq!(strip_comments("SELECT 1 -- comment"), "SELECT 1 ");
}
2023
/// An inline block comment is removed; the surrounding spaces stay.
#[test]
fn strip_block_comment() {
    assert_eq!(strip_comments("SELECT /* x */ 1"), "SELECT  1");
}
2029
/// `--` inside a quoted string is left untouched by comment stripping.
#[test]
fn strip_preserves_strings() {
    let input = "SELECT '-- not a comment'";
    assert_eq!(strip_comments(input), "SELECT '-- not a comment'");
}
2035
/// `--` inside a `$$` string is left untouched by comment stripping.
#[test]
fn strip_preserves_dollar_strings() {
    let input = "SELECT $$-- not a comment$$";
    assert_eq!(strip_comments(input), "SELECT $$-- not a comment$$");
}
2041
/// A nested block comment is removed as a whole, down to its outermost
/// closing delimiter.
#[test]
fn strip_nested_block_comments() {
    let input = "SELECT /* outer /* inner */ still */ 1";
    assert_eq!(strip_comments(input), "SELECT  1");
}
2047
/// A trailing `//` comment is removed just like a `--` comment.
#[test]
fn strip_slash_slash_comment() {
    assert_eq!(strip_comments("SELECT 1 // comment"), "SELECT 1 ");
}
2053
/// `//` inside a quoted string is left untouched by comment stripping.
#[test]
fn strip_slash_slash_preserves_strings() {
    let input = "SELECT '// not a comment'";
    assert_eq!(strip_comments(input), "SELECT '// not a comment'");
}
2059
/// Keyword lookup succeeds regardless of the casing of the input.
#[test]
fn keyword_lookup_positive() {
    for &word in &["SELECT", "select", "From", "WHERE"] {
        assert!(is_cql_keyword(word));
    }
}
2069
/// Ordinary identifiers are not reported as keywords.
#[test]
fn keyword_lookup_negative() {
    for &word in &["my_table", "hello", "xyz"] {
        assert!(!is_cql_keyword(word));
    }
}
2076
/// `significant_tokens` drops whitespace and both comment kinds while
/// keeping the remaining tokens in order.
#[test]
fn significant_tokens_filters_whitespace_and_comments() {
    let tokens = tokenize("SELECT /* comment */ * -- line\nFROM t");
    let sig = significant_tokens(&tokens);

    let kinds: Vec<_> = sig.iter().map(|t| &t.kind).collect();
    assert!(!kinds.contains(&&TokenKind::Whitespace));
    assert!(!kinds.contains(&&TokenKind::LineComment));
    assert!(!kinds.contains(&&TokenKind::BlockComment));

    // Positive check: the negative assertions above would also hold for
    // an empty result, so pin what must survive the filtering.
    let texts: Vec<&str> = sig.iter().map(|t| t.text.as_str()).collect();
    assert_eq!(texts, ["SELECT", "*", "FROM", "t"]);
}
2088
/// The table name `users` after `FROM` is classified as an identifier.
#[test]
fn users_not_keyword_after_from() {
    let lexed = tokenize("SELECT * FROM users");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();
    let table = non_ws[3];
    assert_eq!(table.text, "users");
    assert_eq!(table.kind, TokenKind::Identifier);
}
2101
/// `key` used as a column name, both in the select list and in the
/// `WHERE` clause, and the table name are all identifiers.
#[test]
fn key_not_keyword_after_from() {
    let lexed = tokenize("SELECT key FROM my_table WHERE key = 1");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();
    // Positions of: `key` (select list), `my_table`, `key` (predicate).
    for &idx in &[1usize, 3, 5] {
        assert_eq!(non_ws[idx].kind, TokenKind::Identifier);
    }
}
2116
/// `set` in a select column list is classified as an identifier.
#[test]
fn set_not_keyword_in_column_list() {
    let lexed = tokenize("SELECT set FROM my_table");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();
    let column = non_ws[1];
    assert_eq!(column.text, "set");
    assert_eq!(column.kind, TokenKind::Identifier);
}
2128
/// Column names in a `WHERE` predicate are identifiers, including the
/// one that follows `AND`.
#[test]
fn column_names_after_where_are_identifiers() {
    let lexed = tokenize("SELECT * FROM t WHERE user = 'test' AND key = 1");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();

    let first_column = non_ws[5];
    assert_eq!(first_column.text, "user");
    assert_eq!(first_column.kind, TokenKind::Identifier);

    let second_column = non_ws[9];
    assert_eq!(second_column.text, "key");
    assert_eq!(second_column.kind, TokenKind::Identifier);
}
2143
/// In `SELECT count(*)` the function name right after `SELECT` is an
/// identifier.
#[test]
fn select_with_function() {
    let lexed = tokenize("SELECT count(*) FROM users");
    let non_ws: Vec<_> = lexed
        .iter()
        .filter(|tok| !matches!(tok.kind, TokenKind::Whitespace))
        .collect();
    assert_eq!(non_ws[0].kind, TokenKind::Keyword); // SELECT
    assert_eq!(non_ws[1].kind, TokenKind::Identifier); // count
}
2156
/// `BEGIN`, `BATCH` and `APPLY` are each classified as a keyword at
/// least once in a batch statement.
#[test]
fn batch_statement() {
    let input =
        "BEGIN BATCH INSERT INTO t (id) VALUES (1); INSERT INTO t (id) VALUES (2); APPLY BATCH";
    let tokens = tokenize(input);

    // Case-insensitive comparison without allocating an uppercase copy
    // of every token text for every probe.
    let has_keyword = |word: &str| {
        tokens
            .iter()
            .any(|t| t.kind == TokenKind::Keyword && t.text.eq_ignore_ascii_case(word))
    };

    assert!(has_keyword("BEGIN"), "BEGIN should be a keyword");
    assert!(has_keyword("BATCH"), "BATCH should be a keyword");
    assert!(has_keyword("APPLY"), "APPLY should be a keyword");
}
2170
/// `DELETE FROM <table>` starts with two keywords followed by an
/// identifier.
#[test]
fn delete_from() {
    let kinds = significant_kinds("DELETE FROM users WHERE id = 1");
    assert_eq!(kinds[0], TokenKind::Keyword); // DELETE
    assert_eq!(kinds[1], TokenKind::Keyword); // FROM
    assert_eq!(kinds[2], TokenKind::Identifier); // users
}
2178
/// `DESCRIBE TABLE <name>` is two keywords followed by an identifier.
#[test]
fn describe_table() {
    let kinds = significant_kinds("DESCRIBE TABLE users");
    assert_eq!(kinds[0], TokenKind::Keyword); // DESCRIBE
    assert_eq!(kinds[1], TokenKind::Keyword); // TABLE
    assert_eq!(kinds[2], TokenKind::Identifier); // users
}
2186
/// `TRUNCATE <name>` is a keyword followed by an identifier.
#[test]
fn truncate_table() {
    let kinds = significant_kinds("TRUNCATE users");
    assert_eq!(kinds[0], TokenKind::Keyword); // TRUNCATE
    assert_eq!(kinds[1], TokenKind::Identifier); // users
}
2193
/// `SELECT DISTINCT <column>` is two keywords followed by an identifier.
#[test]
fn select_distinct() {
    let kinds = significant_kinds("SELECT DISTINCT partition_key FROM t");
    assert_eq!(kinds[0], TokenKind::Keyword); // SELECT
    assert_eq!(kinds[1], TokenKind::Keyword); // DISTINCT
    assert_eq!(kinds[2], TokenKind::Identifier); // partition_key
}
2201
/// In `CONSISTENCY QUORUM` the level name is an identifier, not a
/// keyword.
#[test]
fn consistency_level() {
    let kinds = significant_kinds("CONSISTENCY QUORUM");
    assert_eq!(kinds[0], TokenKind::Keyword); // CONSISTENCY
    assert_eq!(kinds[1], TokenKind::Identifier); // QUORUM
}
2208
/// `SERIAL CONSISTENCY ` also expects a consistency level.
#[test]
fn serial_consistency() {
    assert_eq!(
        grammar_context_at_end("SERIAL CONSISTENCY "),
        GrammarContext::ExpectConsistencyLevel
    );
}
2214
/// The column named after `ORDER BY` is the last non-whitespace token
/// and is an identifier.
#[test]
fn order_by_column() {
    let lexed = tokenize("SELECT * FROM t ORDER BY created_at");
    // Scan from the back for the last non-whitespace token.
    let last_significant = lexed
        .iter()
        .rev()
        .find(|tok| tok.kind != TokenKind::Whitespace)
        .unwrap();
    assert_eq!(last_significant.kind, TokenKind::Identifier); // created_at
}
2223
/// A trailing dot after a keyspace name in a `FROM` clause expects the
/// qualified part.
#[test]
fn grammar_context_qualified_table_after_from() {
    assert_eq!(
        grammar_context_at_end("SELECT * FROM system."),
        GrammarContext::ExpectQualifiedPart
    );
}
2232
/// While the part after the dot is only partially typed, the context is
/// still `ExpectQualifiedPart`.
#[test]
fn grammar_context_qualified_table_partial() {
    assert_eq!(
        grammar_context_at_end("SELECT * FROM system.c"),
        GrammarContext::ExpectQualifiedPart
    );
}
2242
/// After `SELECT ` a column list is expected.
#[test]
fn grammar_context_select_column_list() {
    assert_eq!(
        grammar_context_at_end("SELECT "),
        GrammarContext::ExpectColumnList
    );
}
2248
/// After `CREATE ` the kind of object to create is expected.
#[test]
fn grammar_context_create_target() {
    let ctx = grammar_context_at_end("CREATE ");
    assert_eq!(ctx, GrammarContext::ExpectCreateTarget);
}
2256
/// After `ALTER ` the kind of object to alter is expected.
#[test]
fn grammar_context_alter_target() {
    let ctx = grammar_context_at_end("ALTER ");
    assert_eq!(ctx, GrammarContext::ExpectAlterTarget);
}
2264
/// After `DROP ` the kind of object to drop is expected.
#[test]
fn grammar_context_drop_target() {
    let ctx = grammar_context_at_end("DROP ");
    assert_eq!(ctx, GrammarContext::ExpectDropTarget);
}
2272
/// After `DELETE ` the delete-target context applies.
#[test]
fn grammar_context_delete_target() {
    let ctx = grammar_context_at_end("DELETE ");
    assert_eq!(ctx, GrammarContext::ExpectDeleteTarget);
}
2280
/// After `GRANT ` the shared grant/revoke context applies.
#[test]
fn grammar_context_grant_permissions() {
    let ctx = grammar_context_at_end("GRANT ");
    assert_eq!(ctx, GrammarContext::ExpectGrantRevoke);
}
2288
/// After `REVOKE ` the shared grant/revoke context applies.
#[test]
fn grammar_context_revoke_permissions() {
    let ctx = grammar_context_at_end("REVOKE ");
    assert_eq!(ctx, GrammarContext::ExpectGrantRevoke);
}
2296
/// After a bare `INSERT ` the insert-target context applies.
#[test]
fn grammar_context_insert_target() {
    let ctx = grammar_context_at_end("INSERT ");
    assert_eq!(ctx, GrammarContext::ExpectInsertTarget);
}
2304
/// After `INSERT INTO ` a table name is expected.
#[test]
fn grammar_context_insert_into_table() {
    let ctx = grammar_context_at_end("INSERT INTO ");
    assert_eq!(ctx, GrammarContext::ExpectTable);
}
2312
/// After `BEGIN ` the begin-target context applies.
#[test]
fn grammar_context_begin_target() {
    let ctx = grammar_context_at_end("BEGIN ");
    assert_eq!(ctx, GrammarContext::ExpectBeginTarget);
}
2320
/// After `UPDATE ` a table name is expected.
#[test]
fn grammar_context_update_expects_table() {
    let ctx = grammar_context_at_end("UPDATE ");
    assert_eq!(ctx, GrammarContext::ExpectTable);
}
2328
/// After `TRUNCATE ` a table name is expected.
#[test]
fn grammar_context_truncate_expects_table() {
    let ctx = grammar_context_at_end("TRUNCATE ");
    assert_eq!(ctx, GrammarContext::ExpectTable);
}
2336
/// A completed `keyspace.table` after `SELECT … FROM` yields the
/// post-`FROM` context.
#[test]
fn grammar_context_select_post_from_qualified() {
    let ctx = grammar_context_at_end("SELECT * FROM test_ks.users ");
    assert_eq!(ctx, GrammarContext::ExpectSelectPostFrom);
}
2347
/// A completed `keyspace.table` after `UPDATE` yields the update-clause
/// context.
#[test]
fn grammar_context_update_clause_qualified() {
    let ctx = grammar_context_at_end("UPDATE test_ks.users ");
    assert_eq!(ctx, GrammarContext::ExpectUpdateClause);
}
2356
/// A completed `keyspace.table` after `DELETE FROM` yields the delete
/// post-`FROM` context.
#[test]
fn grammar_context_delete_post_from_qualified() {
    let ctx = grammar_context_at_end("DELETE FROM test_ks.users ");
    assert_eq!(ctx, GrammarContext::ExpectDeletePostFrom);
}
2365
/// A completed `keyspace.table` after `INSERT INTO` yields the `General`
/// context.
#[test]
fn grammar_context_insert_into_qualified() {
    let ctx = grammar_context_at_end("INSERT INTO test_ks.users ");
    assert_eq!(ctx, GrammarContext::General);
}
2375}