// Ion Text 1.0 ANTLR v4 Grammar
//
// The following grammar does not encode the Ion Text semantics completely or
// unambiguously.
// Known gaps:
//   * Timestamps are syntactically defined but the rules of ISO 8601 need to be
//     applied (especially regarding day rules with months and leap years).
//   * Non $ion_1_0 version markers are not trapped (e.g. $ion_1_1, $ion_2_0)
//   * The semantics of Symbol Ids, Local Symbol Table defines and imports are not validated
//   * Edge cases around Unicode semantics:
//     - ANTLR specifies only four hex digit unicode escapes and on Java operates
//       on UTF-16 code units (this is a flaw in ANTLR).
//     - The grammar doesn't validate unpaired surrogate escapes in symbols or strings
//       (e.g. "\udc00")
//   * There are some ambiguities involving symbol operators within s-expressions
//     - Do '-' and '+' (possibly others) right or left associate?
//     - Do symbol operators terminate special float symbols (nan, -inf, +inf)?

grammar IonText;

//////////////////////////////////////////////////////////////////////////////
// Parser Rules
//////////////////////////////////////////////////////////////////////////////

// Entry rule: any number of delimited top-level values, optionally followed by
// one last value that needs no trailing delimiter, then end of input.
// note that EOF is a concept for the grammar, technically Ion streams
// are infinite
top_level
    : (ws* top_level_value)* ws* value? EOF
    ;

// A top-level value together with whatever delimits it from the next one.
top_level_value
    : annotation+ top_level_value
    | delimiting_entity
    // numeric literals (if followed by something), need to be followed by
    // whitespace or a token that is either quoted (e.g. string) or
    // starts with punctuation (e.g. clob, struct, list)
    | numeric_entity ws
    | numeric_entity quoted_annotation value
    | numeric_entity delimiting_entity
    // literals that are unquoted symbols or keywords have a similar requirement
    // as the numerics above, they have different productions because the
    // rules for numerics are the same in s-expressions, but keywords
    // have different rules between top-level and s-expressions.
    | keyword_entity ws
    | keyword_entity quoted_annotation value
    | keyword_entity keyword_delimiting_entity
    ;

// TODO let's make sure this terminology
// is consistent with our specification documents
// A value is an entity preceded by zero or more annotations.
value
    : annotation* entity
    ;

entity
    : numeric_entity
    | delimiting_entity
    | keyword_entity
    ;

// Entities that are quoted or start with punctuation; the rules above allow
// them to directly follow a numeric or keyword entity without whitespace.
delimiting_entity
    : quoted_text
    | SHORT_QUOTED_CLOB
    | LONG_QUOTED_CLOB
    | BLOB
    | list
    | sexp
    | struct
    ;

// What may directly follow a keyword entity at the top level.
keyword_delimiting_entity
    : delimiting_entity
    | numeric_entity
    ;

keyword_entity
    : any_null
    | BOOL
    | SPECIAL_FLOAT
    | IDENTIFIER_SYMBOL
    // note that this is because we recognize the type names for null
    // they are ordinary symbols on their own
    | TYPE
    ;

numeric_entity
    : BIN_INTEGER
    | DEC_INTEGER
    | HEX_INTEGER
    | TIMESTAMP
    | FLOAT
    | DECIMAL
    ;

// symbol '::' with optional whitespace around and inside the double colon
// (the two colons are separate COLON tokens).
annotation
    : symbol ws* COLON COLON ws*
    ;

// Annotation whose symbol is quoted.
quoted_annotation
    : QUOTED_SYMBOL ws* COLON COLON ws*
    ;

// Comma-separated values with an optional trailing comma, or an empty list.
// Whitespace is accepted before every separating comma (same shape as the
// struct rule); previously it was accepted only after the first element, so
// input such as `[1, 2 , 3]` was rejected.
list
    : L_BRACKET ws* value (ws* COMMA ws* value)* ws* (COMMA ws*)? R_BRACKET
    | L_BRACKET ws* R_BRACKET
    ;

// S-expression: delimited values, optionally followed by one last value that
// needs no trailing delimiter before the closing parenthesis.
sexp
    : L_PAREN (ws* sexp_value)* ws* value? R_PAREN
    ;

sexp_value
    : annotation+ sexp_value
    | sexp_delimiting_entity
    | operator
    // much like at the top level, numeric/identifiers/keywords
    // have similar delimiting rules
    | numeric_entity ws
    | numeric_entity quoted_annotation value
    | numeric_entity sexp_delimiting_entity
    | sexp_keyword_entity ws
    | sexp_keyword_entity quoted_annotation value
    | sexp_keyword_entity sexp_keyword_delimiting_entity
    // untyped null has its own productions because its follow set excludes DOT
    // (see sexp_null_delimiting_entity)
    | NULL ws
    | NULL quoted_annotation value
    | NULL sexp_null_delimiting_entity
    ;

sexp_delimiting_entity
    : delimiting_entity
    ;

sexp_keyword_delimiting_entity
    : sexp_delimiting_entity
    | numeric_entity
    | operator
    ;

// What may directly follow an untyped null inside an s-expression. DOT is
// excluded here; presumably because NULL DOT begins a typed null.
sexp_null_delimiting_entity
    : delimiting_entity
    | NON_DOT_OPERATOR+
    ;

// Same as keyword_entity except that only typed nulls appear here; the bare
// NULL case is handled directly in sexp_value.
sexp_keyword_entity
    : typed_null
    | BOOL
    | SPECIAL_FLOAT
    | IDENTIFIER_SYMBOL
    // note that this is because we recognize the type names for null
    // they are ordinary symbols on their own
    | TYPE
    ;

// One or more adjacent operator characters form a single operator.
operator
    : (DOT | NON_DOT_OPERATOR)+
    ;

// Comma-separated fields with an optional trailing comma, or an empty struct.
struct
    : L_CURLY ws* field (ws* COMMA ws* field)* ws* (COMMA ws*)? R_CURLY
    | L_CURLY ws* R_CURLY
    ;

field
    : field_name ws* COLON ws* annotation* entity
    ;

any_null
    : NULL
    | typed_null
    ;

typed_null
    : NULL DOT NULL
    | NULL DOT TYPE
    ;

field_name
    : symbol
    | SHORT_QUOTED_STRING
    | (ws* LONG_QUOTED_STRING)+
    ;

// One or more long-quoted segments, optionally separated by whitespace, are
// accepted as a single piece of text.
quoted_text
    : QUOTED_SYMBOL
    | SHORT_QUOTED_STRING
    | (ws* LONG_QUOTED_STRING)+
    ;

symbol
    : IDENTIFIER_SYMBOL
    // note that this is because we recognize the type names for null
    // they are ordinary symbols on their own
    | TYPE
    | QUOTED_SYMBOL
    ;

// Comments are treated as whitespace by the parser.
ws
    : WHITESPACE
    | INLINE_COMMENT
    | BLOCK_COMMENT
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Punctuation
//////////////////////////////////////////////////////////////////////////////

L_BRACKET : '[';
R_BRACKET : ']';
L_PAREN   : '(';
R_PAREN   : ')';
L_CURLY   : '{';
R_CURLY   : '}';
COMMA     : ',';
COLON     : ':';
DOT       : '.';

NON_DOT_OPERATOR
    : [!#%&*+\-/;<=>?@^`|~]
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Whitespace / Comments
//////////////////////////////////////////////////////////////////////////////

WHITESPACE
    : WS+
    ;

// Runs to the end of the line, or to end of input if there is no newline.
INLINE_COMMENT
    : '//' .*? (NL | EOF)
    ;

BLOCK_COMMENT
    : '/*' .*? '*/'
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Null
//////////////////////////////////////////////////////////////////////////////

NULL
    : 'null'
    ;

TYPE
    : 'bool'
    | 'int'
    | 'float'
    | 'decimal'
    | 'timestamp'
    | 'symbol'
    | 'string'
    | 'clob'
    | 'blob'
    | 'list'
    | 'sexp'
    | 'struct'
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Bool
//////////////////////////////////////////////////////////////////////////////

BOOL
    : 'true'
    | 'false'
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Timestamp
//////////////////////////////////////////////////////////////////////////////

// Year and year-month precision require the trailing 'T'; for a full date the
// 'T' and the time part are optional.
TIMESTAMP
    : DATE ('T' TIME?)?
    | YEAR '-' MONTH 'T'
    | YEAR 'T'
    ;

fragment DATE
    : YEAR '-' MONTH '-' DAY
    ;

// Four digits, 0001 through 9999 (0000 is not matched).
fragment YEAR
    : '000'                     [1-9]
    | '00'            [1-9]     DEC_DIGIT
    | '0'   [1-9]     DEC_DIGIT DEC_DIGIT
    | [1-9] DEC_DIGIT DEC_DIGIT DEC_DIGIT
    ;

fragment MONTH
    : '0' [1-9]
    | '1' [0-2]
    ;

// 01 through 31; validity against the month is not checked here.
fragment DAY
    : '0'   [1-9]
    | [1-2] DEC_DIGIT
    | '3'   [0-1]
    ;

// The offset is mandatory whenever a time is present.
fragment TIME
    : HOUR ':' MINUTE (':' SECOND)? OFFSET
    ;

fragment OFFSET
    : 'Z'
    | PLUS_OR_MINUS HOUR ':' MINUTE
    ;

fragment HOUR
    : [01] DEC_DIGIT
    | '2' [0-3]
    ;

fragment MINUTE
    : [0-5] DEC_DIGIT
    ;

// note that W3C spec requires a digit after the '.'
fragment SECOND
    : [0-5] DEC_DIGIT ('.' DEC_DIGIT+)?
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Int
//////////////////////////////////////////////////////////////////////////////

// Underscores may separate digits but cannot lead, trail, or repeat.
BIN_INTEGER
    : '-'? '0' [bB] BINARY_DIGIT (UNDERSCORE? BINARY_DIGIT)*
    ;

DEC_INTEGER
    : '-'? DEC_UNSIGNED_INTEGER
    ;

HEX_INTEGER
    : '-'? '0' [xX] HEX_DIGIT (UNDERSCORE? HEX_DIGIT)*
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Float
//////////////////////////////////////////////////////////////////////////////

// 'inf' always carries an explicit sign; 'nan' never does.
SPECIAL_FLOAT
    : PLUS_OR_MINUS 'inf'
    | 'nan'
    ;

// The 'e'/'E' exponent is what distinguishes a float from a decimal.
FLOAT
    : DEC_INTEGER DEC_FRAC? FLOAT_EXP
    ;

fragment FLOAT_EXP
    : [Ee] PLUS_OR_MINUS? DEC_DIGIT+
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Decimal
//////////////////////////////////////////////////////////////////////////////

// Both the fraction and the exponent are optional, so this rule can match the
// same text as DEC_INTEGER; DEC_INTEGER is declared earlier, and ANTLR
// resolves equal-length matches in favour of the first declared rule.
DECIMAL
    : DEC_INTEGER DEC_FRAC? DECIMAL_EXP?
    ;

fragment DECIMAL_EXP
    : [Dd] PLUS_OR_MINUS? DEC_DIGIT+
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion Symbol
//////////////////////////////////////////////////////////////////////////////

QUOTED_SYMBOL
    : SYMBOL_QUOTE SYMBOL_TEXT SYMBOL_QUOTE
    ;

fragment SYMBOL_TEXT
    : (TEXT_ESCAPE | SYMBOL_TEXT_ALLOWED)*
    ;

// non-control Unicode and not single quote or backslash
fragment SYMBOL_TEXT_ALLOWED
    : '\u0020'..'\u0026' // no C0 control characters and no U+0027 single quote
    | '\u0028'..'\u005B' // no U+005C backslash
    | '\u005D'..'\uFFFF' // should be up to U+10FFFF
    | WS_NOT_NL
    ;

IDENTIFIER_SYMBOL
    : [$_a-zA-Z] ([$_a-zA-Z] | DEC_DIGIT)*
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion String
//////////////////////////////////////////////////////////////////////////////

SHORT_QUOTED_STRING
    : SHORT_QUOTE STRING_SHORT_TEXT SHORT_QUOTE
    ;

LONG_QUOTED_STRING
    : LONG_QUOTE STRING_LONG_TEXT LONG_QUOTE
    ;

fragment STRING_SHORT_TEXT
    : (TEXT_ESCAPE | STRING_SHORT_TEXT_ALLOWED)*
    ;

// Non-greedy so that the text stops at the first closing triple quote.
fragment STRING_LONG_TEXT
    : (TEXT_ESCAPE | STRING_LONG_TEXT_ALLOWED)*?
    ;

// non-control Unicode and not double quote or backslash
fragment STRING_SHORT_TEXT_ALLOWED
    : '\u0020'..'\u0021' // no C0 control characters and no U+0022 double quote
    | '\u0023'..'\u005B' // no U+005C backslash
    | '\u005D'..'\uFFFF' // FIXME should be up to U+10FFFF
    | WS_NOT_NL
    ;

// non-control Unicode (newlines are OK)
fragment STRING_LONG_TEXT_ALLOWED
    : '\u0020'..'\u005B' // no C0 control characters and no U+005C backslash
    | '\u005D'..'\uFFFF' // FIXME should be up to U+10FFFF
    | WS
    ;

fragment TEXT_ESCAPE
    : COMMON_ESCAPE
    | HEX_ESCAPE
    | UNICODE_ESCAPE
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion CLOB
//////////////////////////////////////////////////////////////////////////////

SHORT_QUOTED_CLOB
    : LOB_START WS* SHORT_QUOTE CLOB_SHORT_TEXT SHORT_QUOTE WS* LOB_END
    ;

// One or more long-quoted segments between the lob delimiters.
LONG_QUOTED_CLOB
    : LOB_START (WS* LONG_QUOTE CLOB_LONG_TEXT*? LONG_QUOTE)+ WS* LOB_END
    ;

fragment CLOB_SHORT_TEXT
    : (CLOB_ESCAPE | CLOB_SHORT_TEXT_ALLOWED)*
    ;

// Up to two single quotes may appear in a row as long as a non-quote
// character (or escape) follows them.
fragment CLOB_LONG_TEXT
    : CLOB_LONG_TEXT_NO_QUOTE
    | '\'' CLOB_LONG_TEXT_NO_QUOTE
    | '\'\'' CLOB_LONG_TEXT_NO_QUOTE
    ;

fragment CLOB_LONG_TEXT_NO_QUOTE
    : (CLOB_ESCAPE | CLOB_LONG_TEXT_ALLOWED)
    ;

// non-control ASCII and not double quote or backslash
fragment CLOB_SHORT_TEXT_ALLOWED
    : '\u0020'..'\u0021' // no U+0022 double quote
    | '\u0023'..'\u005B' // no U+005C backslash
    | '\u005D'..'\u007F'
    | WS_NOT_NL
    ;

// non-control ASCII (newlines are OK)
fragment CLOB_LONG_TEXT_ALLOWED
    : '\u0020'..'\u0026' // no U+0027 single quote
    | '\u0028'..'\u005B' // no U+005C backslash
    | '\u005D'..'\u007F'
    | WS
    ;

// Clobs accept the common and \x escapes but not the Unicode escapes.
fragment CLOB_ESCAPE
    : COMMON_ESCAPE
    | HEX_ESCAPE
    ;

//////////////////////////////////////////////////////////////////////////////
// Ion BLOB
//////////////////////////////////////////////////////////////////////////////

// Base-64 quartets with whitespace allowed anywhere between characters,
// optionally ending in one padded group.
BLOB
    : LOB_START (BASE_64_QUARTET | WS)* BASE_64_PAD? WS* LOB_END
    ;

fragment BASE_64_PAD
    : BASE_64_PAD1
    | BASE_64_PAD2
    ;

fragment BASE_64_QUARTET
    : BASE_64_CHAR WS* BASE_64_CHAR WS* BASE_64_CHAR WS* BASE_64_CHAR
    ;

fragment BASE_64_PAD1
    : BASE_64_CHAR WS* BASE_64_CHAR WS* BASE_64_CHAR WS* '='
    ;

fragment BASE_64_PAD2
    : BASE_64_CHAR WS* BASE_64_CHAR WS* '=' WS* '='
    ;

fragment BASE_64_CHAR
    : [0-9a-zA-Z+/]
    ;

//////////////////////////////////////////////////////////////////////////////
// Common Lexer Primitives
//////////////////////////////////////////////////////////////////////////////

fragment LOB_START    : '{{';
fragment LOB_END      : '}}';
fragment SYMBOL_QUOTE : '\'';
fragment SHORT_QUOTE  : '"';
fragment LONG_QUOTE   : '\'\'\'';

// Ion does not allow leading zeros for base-10 numbers
fragment DEC_UNSIGNED_INTEGER
    : '0'
    | [1-9] (UNDERSCORE? DEC_DIGIT)*
    ;

// A bare '.' with no digits after it is accepted.
fragment DEC_FRAC
    : '.'
    | '.' DEC_DIGIT (UNDERSCORE? DEC_DIGIT)*
    ;

fragment DEC_DIGIT
    : [0-9]
    ;

fragment HEX_DIGIT
    : [0-9a-fA-F]
    ;

fragment BINARY_DIGIT
    : [01]
    ;

fragment PLUS_OR_MINUS
    : [+\-]
    ;

fragment COMMON_ESCAPE
    : '\\' COMMON_ESCAPE_CODE
    ;

// A backslash followed by a newline is also accepted as an escape.
fragment COMMON_ESCAPE_CODE
    : 'a'
    | 'b'
    | 't'
    | 'n'
    | 'f'
    | 'r'
    | 'v'
    | '?'
    | '0'
    | '\''
    | '"'
    | '/'
    | '\\'
    | NL
    ;

fragment HEX_ESCAPE
    : '\\x' HEX_DIGIT HEX_DIGIT
    ;

// \uXXXX, or an eight-digit \U form restricted to 000XXXXX or 0010XXXX.
fragment UNICODE_ESCAPE
    : '\\u'     HEX_DIGIT_QUARTET
    | '\\U000'  HEX_DIGIT_QUARTET HEX_DIGIT
    | '\\U0010' HEX_DIGIT_QUARTET
    ;

fragment HEX_DIGIT_QUARTET
    : HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;

fragment WS
    : WS_NOT_NL
    | '\u000A' // line feed
    | '\u000D' // carriage return
    ;

fragment NL
    : '\u000D\u000A' // carriage return + line feed
    | '\u000D'       // carriage return
    | '\u000A'       // line feed
    ;

fragment WS_NOT_NL
    : '\u0009' // tab
    | '\u000B' // vertical tab
    | '\u000C' // form feed
    | '\u0020' // space
    ;

fragment UNDERSCORE
    : '_'
    ;