sqlglot.dialects.bigquery
1from __future__ import annotations 2 3import logging 4import re 5import typing as t 6 7 8from sqlglot.optimizer.annotate_types import TypeAnnotator 9 10from sqlglot import exp, generator, jsonpath, parser, tokens, transforms 11from sqlglot._typing import E 12from sqlglot.dialects.dialect import ( 13 Dialect, 14 NormalizationStrategy, 15 annotate_with_type_lambda, 16 arg_max_or_min_no_count, 17 binary_from_function, 18 date_add_interval_sql, 19 datestrtodate_sql, 20 build_formatted_time, 21 filter_array_using_unnest, 22 if_sql, 23 inline_array_unless_query, 24 max_or_greatest, 25 min_or_least, 26 no_ilike_sql, 27 build_date_delta_with_interval, 28 regexp_replace_sql, 29 rename_func, 30 sha256_sql, 31 timestrtotime_sql, 32 ts_or_ds_add_cast, 33 unit_to_var, 34 strposition_sql, 35 groupconcat_sql, 36) 37from sqlglot.helper import seq_get, split_num_words 38from sqlglot.tokens import TokenType 39from sqlglot.generator import unsupported_args 40 41if t.TYPE_CHECKING: 42 from sqlglot._typing import Lit 43 44 from sqlglot.optimizer.annotate_types import TypeAnnotator 45 46logger = logging.getLogger("sqlglot") 47 48 49JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray] 50 51DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY") 52 53 54def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str: 55 if not expression.find_ancestor(exp.From, exp.Join): 56 return self.values_sql(expression) 57 58 structs = [] 59 alias = expression.args.get("alias") 60 for tup in expression.find_all(exp.Tuple): 61 field_aliases = ( 62 alias.columns 63 if alias and alias.columns 64 else (f"_c{i}" for i in range(len(tup.expressions))) 65 ) 66 expressions = [ 67 exp.PropertyEQ(this=exp.to_identifier(name), expression=fld) 68 for name, fld in zip(field_aliases, tup.expressions) 69 ] 70 structs.append(exp.Struct(expressions=expressions)) 71 72 # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression 73 alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None 74 return self.unnest_sql( 75 exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only) 76 ) 77 78 79def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str: 80 this = expression.this 81 if isinstance(this, exp.Schema): 82 this = f"{self.sql(this, 'this')} <{self.expressions(this)}>" 83 else: 84 this = self.sql(this) 85 return f"RETURNS {this}" 86 87 88def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str: 89 returns = expression.find(exp.ReturnsProperty) 90 if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"): 91 expression.set("kind", "TABLE FUNCTION") 92 93 if isinstance(expression.expression, (exp.Subquery, exp.Literal)): 94 expression.set("expression", expression.expression.this) 95 96 return self.create_sql(expression) 97 98 99# https://issuetracker.google.com/issues/162294746 100# workaround for bigquery bug when grouping by an expression and then ordering 101# WITH x AS (SELECT 1 y) 102# SELECT y + 1 z 103# FROM x 104# GROUP BY x + 1 105# ORDER by z 106def _alias_ordered_group(expression: exp.Expression) -> exp.Expression: 107 if isinstance(expression, exp.Select): 108 group = expression.args.get("group") 109 order = expression.args.get("order") 110 111 if group and order: 112 aliases = { 113 select.this: select.args["alias"] 114 for select in expression.selects 115 if isinstance(select, exp.Alias) 116 } 117 118 for grouped in group.expressions: 119 if grouped.is_int: 120 continue 121 alias = aliases.get(grouped) 122 if alias: 123 grouped.replace(exp.column(alias)) 124 125 return expression 126 127 128def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression: 129 """BigQuery doesn't allow column names when defining a CTE, so we try to push them down.""" 130 if isinstance(expression, exp.CTE) and expression.alias_column_names: 131 cte_query = expression.this 132 133 if cte_query.is_star: 134 logger.warning( 135 "Can't push down CTE column names for star queries. Run the query through" 136 " the optimizer or use 'qualify' to expand the star projections first." 137 ) 138 return expression 139 140 column_names = expression.alias_column_names 141 expression.args["alias"].set("columns", None) 142 143 for name, select in zip(column_names, cte_query.selects): 144 to_replace = select 145 146 if isinstance(select, exp.Alias): 147 select = select.this 148 149 # Inner aliases are shadowed by the CTE column names 150 to_replace.replace(exp.alias_(select, name)) 151 152 return expression 153 154 155def _build_parse_timestamp(args: t.List) -> exp.StrToTime: 156 this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)]) 157 this.set("zone", seq_get(args, 2)) 158 return this 159 160 161def _build_timestamp(args: t.List) -> exp.Timestamp: 162 timestamp = exp.Timestamp.from_arg_list(args) 163 timestamp.set("with_tz", True) 164 return timestamp 165 166 167def _build_date(args: t.List) -> exp.Date | exp.DateFromParts: 168 expr_type = exp.DateFromParts if len(args) == 3 else exp.Date 169 return expr_type.from_arg_list(args) 170 171 172def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5: 173 # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation 174 arg = seq_get(args, 0) 175 return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg) 176 177 178def _build_json_strip_nulls(args: t.List) -> exp.JSONStripNulls: 179 expression = exp.JSONStripNulls(this=seq_get(args, 0)) 180 181 for arg in args[1:]: 182 if isinstance(arg, exp.Kwarg): 183 expression.set(arg.this.name.lower(), arg) 184 else: 185 expression.set("expression", arg) 186 187 return expression 188 189 190def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str: 191 return self.sql( 192 exp.Exists( 193 this=exp.select("1") 194 .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"])) 195 .where(exp.column("_col").eq(expression.right)) 196 ) 197 ) 198 199 200def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str: 201 return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression)) 202 203 204def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str: 205 expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP)) 206 expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP)) 207 unit = unit_to_var(expression) 208 return self.func("DATE_DIFF", expression.this, expression.expression, unit) 209 210 211def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str: 212 scale = expression.args.get("scale") 213 timestamp = expression.this 214 215 if scale in (None, exp.UnixToTime.SECONDS): 216 return self.func("TIMESTAMP_SECONDS", timestamp) 217 if scale == exp.UnixToTime.MILLIS: 218 return self.func("TIMESTAMP_MILLIS", timestamp) 219 if scale == exp.UnixToTime.MICROS: 220 return self.func("TIMESTAMP_MICROS", timestamp) 221 222 unix_seconds = exp.cast( 223 exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT 224 ) 225 return self.func("TIMESTAMP_SECONDS", unix_seconds) 226 227 228def _build_time(args: t.List) -> exp.Func: 229 if len(args) == 1: 230 return exp.TsOrDsToTime(this=args[0]) 231 if len(args) == 2: 232 return exp.Time.from_arg_list(args) 233 return exp.TimeFromParts.from_arg_list(args) 234 235 236def _build_datetime(args: t.List) -> exp.Func: 237 if len(args) == 1: 238 return exp.TsOrDsToDatetime.from_arg_list(args) 239 if len(args) == 2: 240 return exp.Datetime.from_arg_list(args) 241 return exp.TimestampFromParts.from_arg_list(args) 242 243 244def _build_regexp_extract( 245 expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None 246) -> t.Callable[[t.List], E]: 247 def _builder(args: t.List) -> E: 248 try: 249 group = re.compile(args[1].name).groups == 1 250 except re.error: 251 group = False 252 253 # Default group is used for the transpilation of REGEXP_EXTRACT_ALL 254 return expr_type( 255 this=seq_get(args, 0), 256 expression=seq_get(args, 1), 257 position=seq_get(args, 2), 258 occurrence=seq_get(args, 3), 259 group=exp.Literal.number(1) if group else default_group, 260 ) 261 262 return _builder 263 264 265def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]: 266 def _builder(args: t.List, dialect: Dialect) -> E: 267 if len(args) == 1: 268 # The default value for the JSONPath is '$' i.e all of the data 269 args.append(exp.Literal.string("$")) 270 return parser.build_extract_json_with_path(expr_type)(args, dialect) 271 272 return _builder 273 274 275def _str_to_datetime_sql( 276 self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime 277) -> str: 278 this = self.sql(expression, "this") 279 dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP" 280 281 if expression.args.get("safe"): 282 fmt = self.format_time( 283 expression, 284 self.dialect.INVERSE_FORMAT_MAPPING, 285 self.dialect.INVERSE_FORMAT_TRIE, 286 ) 287 return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})" 288 289 fmt = self.format_time(expression) 290 return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone")) 291 292 293def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E: 294 """ 295 Many BigQuery math functions such as CEIL, FLOOR etc follow this return type convention: 296 +---------+---------+---------+------------+---------+ 297 | INPUT | INT64 | NUMERIC | BIGNUMERIC | FLOAT64 | 298 +---------+---------+---------+------------+---------+ 299 | OUTPUT | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 | 300 +---------+---------+---------+------------+---------+ 301 """ 302 self._annotate_args(expression) 303 304 this: exp.Expression = expression.this 305 306 self._set_type( 307 expression, 308 exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type, 309 ) 310 return expression 311 312 313def _annotate_by_args_with_coerce(self: TypeAnnotator, expression: E) -> E: 314 """ 315 +------------+------------+------------+-------------+---------+ 316 | INPUT | INT64 | NUMERIC | BIGNUMERIC | FLOAT64 | 317 +------------+------------+------------+-------------+---------+ 318 | INT64 | INT64 | NUMERIC | BIGNUMERIC | FLOAT64 | 319 | NUMERIC | NUMERIC | NUMERIC | BIGNUMERIC | FLOAT64 | 320 | BIGNUMERIC | BIGNUMERIC | BIGNUMERIC | BIGNUMERIC | FLOAT64 | 321 | FLOAT64 | FLOAT64 | FLOAT64 | FLOAT64 | FLOAT64 | 322 +------------+------------+------------+-------------+---------+ 323 """ 324 self._annotate_args(expression) 325 326 self._set_type(expression, self._maybe_coerce(expression.this.type, expression.expression.type)) 327 return expression 328 329 330def _annotate_by_args_approx_top(self: TypeAnnotator, expression: exp.ApproxTopK) -> exp.ApproxTopK: 331 self._annotate_args(expression) 332 333 struct_type = exp.DataType( 334 this=exp.DataType.Type.STRUCT, 335 expressions=[expression.this.type, exp.DataType(this=exp.DataType.Type.BIGINT)], 336 nested=True, 337 ) 338 self._set_type( 339 expression, 340 exp.DataType(this=exp.DataType.Type.ARRAY, expressions=[struct_type], nested=True), 341 ) 342 343 return expression 344 345 346@unsupported_args("ins_cost", "del_cost", "sub_cost") 347def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str: 348 max_dist = expression.args.get("max_dist") 349 if max_dist: 350 max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist) 351 352 return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist) 353 354 355def _build_levenshtein(args: t.List) -> exp.Levenshtein: 356 max_dist = seq_get(args, 2) 357 return exp.Levenshtein( 358 this=seq_get(args, 0), 359 expression=seq_get(args, 1), 360 max_dist=max_dist.expression if max_dist else None, 361 ) 362 363 364def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]: 365 def _builder(args: t.List) -> exp.TimeToStr: 366 return exp.TimeToStr( 367 this=expr_type(this=seq_get(args, 1)), 368 format=seq_get(args, 0), 369 zone=seq_get(args, 2), 370 ) 371 372 return _builder 373 374 375def _build_contains_substring(args: t.List) -> exp.Contains: 376 # Lowercase the operands in case of transpilation, as exp.Contains 377 # is case-sensitive on other dialects 378 this = exp.Lower(this=seq_get(args, 0)) 379 expr = exp.Lower(this=seq_get(args, 1)) 380 381 return exp.Contains(this=this, expression=expr, json_scope=seq_get(args, 2)) 382 383 384def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str: 385 name = (expression._meta and expression.meta.get("name")) or expression.sql_name() 386 upper = name.upper() 387 388 dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS 389 390 if dquote_escaping: 391 self._quote_json_path_key_using_brackets = False 392 393 sql = rename_func(upper)(self, expression) 394 395 if dquote_escaping: 396 self._quote_json_path_key_using_brackets = True 397 398 return sql 399 400 401def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat: 402 annotated = self._annotate_by_args(expression, "expressions") 403 404 # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING 405 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat 406 if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN): 407 annotated.type = exp.DataType.Type.VARCHAR 408 409 return annotated 410 411 412def _annotate_array(self: TypeAnnotator, expression: exp.Array) -> exp.Array: 413 array_args = expression.expressions 414 415 # BigQuery behaves as follows: 416 # 417 # SELECT t, TYPEOF(t) FROM (SELECT 'foo') AS t -- foo, STRUCT<STRING> 418 # SELECT ARRAY(SELECT 'foo'), TYPEOF(ARRAY(SELECT 'foo')) -- foo, ARRAY<STRING> 419 if ( 420 len(array_args) == 1 421 and isinstance(select := array_args[0].unnest(), exp.Select) 422 and (query_type := select.meta.get("query_type")) is not None 423 and query_type.is_type(exp.DataType.Type.STRUCT) 424 and len(query_type.expressions) == 1 425 and isinstance(col_def := query_type.expressions[0], exp.ColumnDef) 426 and (projection_type := col_def.kind) is not None 427 and not projection_type.is_type(exp.DataType.Type.UNKNOWN) 428 ): 429 array_type = exp.DataType( 430 this=exp.DataType.Type.ARRAY, 431 expressions=[projection_type.copy()], 432 nested=True, 433 ) 434 return self._annotate_with_type(expression, array_type) 435 436 return self._annotate_by_args(expression, "expressions", array=True) 437 438 439class BigQuery(Dialect): 440 WEEK_OFFSET = -1 441 UNNEST_COLUMN_ONLY = True 442 SUPPORTS_USER_DEFINED_TYPES = False 443 SUPPORTS_SEMI_ANTI_JOIN = False 444 LOG_BASE_FIRST = False 445 HEX_LOWERCASE = True 446 FORCE_EARLY_ALIAS_REF_EXPANSION = True 447 PRESERVE_ORIGINAL_NAMES = True 448 HEX_STRING_IS_INTEGER_TYPE = True 449 450 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 451 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 452 453 # bigquery udfs are case sensitive 454 NORMALIZE_FUNCTIONS = False 455 456 # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time 457 TIME_MAPPING = { 458 "%D": "%m/%d/%y", 459 "%E6S": "%S.%f", 460 "%e": "%-d", 461 } 462 463 FORMAT_MAPPING = { 464 "DD": "%d", 465 "MM": "%m", 466 "MON": "%b", 467 "MONTH": "%B", 468 "YYYY": "%Y", 469 "YY": "%y", 470 "HH": "%I", 471 "HH12": "%I", 472 "HH24": "%H", 473 "MI": "%M", 474 "SS": "%S", 475 "SSSSS": "%f", 476 "TZH": "%z", 477 } 478 479 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 480 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 481 # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix 482 # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column 483 PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE", "_TABLE_SUFFIX", "_FILE_NAME"} 484 485 # All set operations require either a DISTINCT or ALL specifier 486 SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None) 487 488 # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont 489 COERCES_TO = { 490 **TypeAnnotator.COERCES_TO, 491 exp.DataType.Type.BIGDECIMAL: {exp.DataType.Type.DOUBLE}, 492 } 493 COERCES_TO[exp.DataType.Type.DECIMAL] |= {exp.DataType.Type.BIGDECIMAL} 494 COERCES_TO[exp.DataType.Type.BIGINT] |= {exp.DataType.Type.BIGDECIMAL} 495 496 # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types 497 TYPE_TO_EXPRESSIONS = { 498 **Dialect.TYPE_TO_EXPRESSIONS, 499 exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP], 500 } 501 TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP) 502 503 ANNOTATORS = { 504 **Dialect.ANNOTATORS, 505 **{ 506 expr_type: annotate_with_type_lambda(data_type) 507 for data_type, expressions in TYPE_TO_EXPRESSIONS.items() 508 for expr_type in expressions 509 }, 510 **{ 511 expr_type: lambda self, e: _annotate_math_functions(self, e) 512 for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round) 513 }, 514 **{ 515 expr_type: lambda self, e: self._annotate_by_args(e, "this") 516 for expr_type in ( 517 exp.Abs, 518 exp.ArgMax, 519 exp.ArgMin, 520 exp.DateTrunc, 521 exp.DatetimeTrunc, 522 exp.FirstValue, 523 exp.GroupConcat, 524 exp.IgnoreNulls, 525 exp.JSONExtract, 526 exp.Lead, 527 exp.Left, 528 exp.Lower, 529 exp.NthValue, 530 exp.Pad, 531 exp.PercentileDisc, 532 exp.RegexpExtract, 533 exp.RegexpReplace, 534 exp.Repeat, 535 exp.Replace, 536 exp.RespectNulls, 537 exp.Reverse, 538 exp.Right, 539 exp.SafeNegate, 540 exp.Sign, 541 exp.Substring, 542 exp.TimestampTrunc, 543 exp.Translate, 544 exp.Trim, 545 exp.Upper, 546 ) 547 }, 548 exp.Acos: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 549 exp.Acosh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 550 exp.Asin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 551 exp.Asinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 552 exp.Atan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 553 exp.Atanh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 554 exp.Atan2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 555 exp.ApproxTopSum: lambda self, e: _annotate_by_args_approx_top(self, e), 556 exp.ApproxTopK: lambda self, e: _annotate_by_args_approx_top(self, e), 557 exp.ApproxQuantiles: lambda self, e: self._annotate_by_args(e, "this", array=True), 558 exp.Array: _annotate_array, 559 exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"), 560 exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 561 exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 562 exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 563 exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 564 exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 565 exp.ByteLength: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 566 exp.ByteString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 567 exp.Cbrt: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 568 exp.CodePointsToBytes: lambda self, e: self._annotate_with_type( 569 e, exp.DataType.Type.BINARY 570 ), 571 exp.CodePointsToString: lambda self, e: self._annotate_with_type( 572 e, exp.DataType.Type.VARCHAR 573 ), 574 exp.Concat: _annotate_concat, 575 exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 576 exp.Cot: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 577 exp.CosineDistance: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 578 exp.Coth: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 579 exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 580 exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 581 exp.Csc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 582 exp.Csch: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 583 exp.CumeDist: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 584 exp.DateFromUnixDate: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATE), 585 exp.DenseRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 586 exp.EuclideanDistance: lambda self, e: self._annotate_with_type( 587 e, exp.DataType.Type.DOUBLE 588 ), 589 exp.FarmFingerprint: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 590 exp.Unhex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 591 exp.Float64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 592 exp.Format: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 593 exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type( 594 e, exp.DataType.build("ARRAY<TIMESTAMP>", dialect="bigquery") 595 ), 596 exp.Grouping: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 597 exp.IsInf: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 598 exp.IsNan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 599 exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 600 exp.JSONArrayAppend: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 601 exp.JSONArrayInsert: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 602 exp.JSONBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 603 exp.JSONExtractScalar: lambda self, e: self._annotate_with_type( 604 e, exp.DataType.Type.VARCHAR 605 ), 606 exp.JSONExtractArray: lambda self, e: self._annotate_by_args(e, "this", array=True), 607 exp.JSONFormat: lambda self, e: self._annotate_with_type( 608 e, exp.DataType.Type.JSON if e.args.get("to_json") else exp.DataType.Type.VARCHAR 609 ), 610 exp.JSONKeysAtDepth: lambda self, e: self._annotate_with_type( 611 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 612 ), 613 exp.JSONObject: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 614 exp.JSONRemove: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 615 exp.JSONSet: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 616 exp.JSONStripNulls: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 617 exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 618 exp.JSONValueArray: lambda self, e: self._annotate_with_type( 619 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 620 ), 621 exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"), 622 exp.LowerHex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 623 exp.LaxBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 624 exp.LaxFloat64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 625 exp.LaxInt64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 626 exp.LaxString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 627 exp.MD5Digest: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 628 exp.Normalize: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 629 exp.Ntile: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 630 exp.ParseTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 631 exp.ParseDatetime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATETIME), 632 exp.ParseBignumeric: lambda self, e: self._annotate_with_type( 633 e, exp.DataType.Type.BIGDECIMAL 634 ), 635 exp.ParseNumeric: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DECIMAL), 636 exp.PercentileCont: lambda self, e: _annotate_by_args_with_coerce(self, e), 637 exp.PercentRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 638 exp.Rank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 639 exp.RangeBucket: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 640 exp.RegexpExtractAll: lambda self, e: self._annotate_by_args(e, "this", array=True), 641 exp.RegexpInstr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 642 exp.RowNumber: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 643 exp.Rand: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 644 exp.SafeConvertBytesToString: lambda self, e: self._annotate_with_type( 645 e, exp.DataType.Type.VARCHAR 646 ), 647 exp.SafeAdd: lambda self, e: _annotate_by_args_with_coerce(self, e), 648 exp.SafeMultiply: lambda self, e: _annotate_by_args_with_coerce(self, e), 649 exp.SafeSubtract: lambda self, e: _annotate_by_args_with_coerce(self, e), 650 exp.Sec: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 651 exp.Sech: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 652 exp.Soundex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 653 exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 654 exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 655 exp.Sin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 656 exp.Sinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 657 exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True), 658 exp.TimestampFromParts: lambda self, e: self._annotate_with_type( 659 e, exp.DataType.Type.DATETIME 660 ), 661 exp.TimeFromParts: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 662 exp.TimeTrunc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 663 exp.ToCodePoints: lambda self, e: self._annotate_with_type( 664 e, exp.DataType.build("ARRAY<BIGINT>", dialect="bigquery") 665 ), 666 exp.TsOrDsToTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 667 exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 668 exp.Uuid: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 669 } 670 671 def normalize_identifier(self, expression: E) -> E: 672 if ( 673 isinstance(expression, exp.Identifier) 674 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 675 ): 676 parent = expression.parent 677 while isinstance(parent, exp.Dot): 678 parent = parent.parent 679 680 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 681 # by default. The following check uses a heuristic to detect tables based on whether 682 # they are qualified. This should generally be correct, because tables in BigQuery 683 # must be qualified with at least a dataset, unless @@dataset_id is set. 684 case_sensitive = ( 685 isinstance(parent, exp.UserDefinedFunction) 686 or ( 687 isinstance(parent, exp.Table) 688 and parent.db 689 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 690 ) 691 or expression.meta.get("is_table") 692 ) 693 if not case_sensitive: 694 expression.set("this", expression.this.lower()) 695 696 return t.cast(E, expression) 697 698 return super().normalize_identifier(expression) 699 700 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 701 VAR_TOKENS = { 702 TokenType.DASH, 703 TokenType.VAR, 704 } 705 706 class Tokenizer(tokens.Tokenizer): 707 QUOTES = ["'", '"', '"""', "'''"] 708 COMMENTS = ["--", "#", ("/*", "*/")] 709 IDENTIFIERS = ["`"] 710 STRING_ESCAPES = ["\\"] 711 712 HEX_STRINGS = [("0x", ""), ("0X", "")] 713 714 BYTE_STRINGS = [ 715 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 716 ] 717 718 RAW_STRINGS = [ 719 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 720 ] 721 722 NESTED_COMMENTS = False 723 724 KEYWORDS = { 725 **tokens.Tokenizer.KEYWORDS, 726 "ANY TYPE": TokenType.VARIANT, 727 "BEGIN": TokenType.COMMAND, 728 "BEGIN TRANSACTION": TokenType.BEGIN, 729 "BYTEINT": TokenType.INT, 730 "BYTES": TokenType.BINARY, 731 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 732 "DATETIME": TokenType.TIMESTAMP, 733 "DECLARE": TokenType.DECLARE, 734 "ELSEIF": TokenType.COMMAND, 735 "EXCEPTION": TokenType.COMMAND, 736 "EXPORT": TokenType.EXPORT, 737 "FLOAT64": TokenType.DOUBLE, 738 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 739 "LOOP": TokenType.COMMAND, 740 "MODEL": TokenType.MODEL, 741 "NOT DETERMINISTIC": TokenType.VOLATILE, 742 "RECORD": TokenType.STRUCT, 743 "REPEAT": TokenType.COMMAND, 744 "TIMESTAMP": TokenType.TIMESTAMPTZ, 745 "WHILE": TokenType.COMMAND, 746 } 747 KEYWORDS.pop("DIV") 748 KEYWORDS.pop("VALUES") 749 KEYWORDS.pop("/*+") 750 751 class Parser(parser.Parser): 752 PREFIXED_PIVOT_COLUMNS = True 753 LOG_DEFAULTS_TO_LN = True 754 SUPPORTS_IMPLICIT_UNNEST = True 755 JOINS_HAVE_EQUAL_PRECEDENCE = True 756 757 # BigQuery does not allow ASC/DESC to be used as an identifier 758 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 759 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 760 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 761 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 762 TokenType.ASC, 763 TokenType.DESC, 764 } 765 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 766 767 FUNCTIONS = { 768 **parser.Parser.FUNCTIONS, 769 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 770 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 771 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 772 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 773 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 774 "BOOL": exp.JSONBool.from_arg_list, 775 "CONTAINS_SUBSTR": _build_contains_substring, 776 "DATE": _build_date, 777 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 778 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 779 "DATE_TRUNC": lambda args: exp.DateTrunc( 780 unit=seq_get(args, 1), 781 this=seq_get(args, 0), 782 zone=seq_get(args, 2), 783 ), 784 "DATETIME": _build_datetime, 785 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 786 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 787 "DIV": binary_from_function(exp.IntDiv), 788 "EDIT_DISTANCE": _build_levenshtein, 789 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 790 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 791 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 792 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 793 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 794 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 795 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 796 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 797 "JSON_STRIP_NULLS": _build_json_strip_nulls, 798 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 799 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 800 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 801 "MD5": exp.MD5Digest.from_arg_list, 802 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 803 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 804 ), 805 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 806 "TO_HEX": _build_to_hex, 807 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 808 [seq_get(args, 1), seq_get(args, 0)] 809 ), 810 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 811 [seq_get(args, 1), seq_get(args, 0)] 812 ), 813 "PARSE_TIMESTAMP": _build_parse_timestamp, 814 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 815 [seq_get(args, 1), seq_get(args, 0)] 816 ), 817 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 818 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 819 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 820 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 821 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 822 ), 823 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 824 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 825 "SPLIT": lambda args: exp.Split( 826 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 827 this=seq_get(args, 0), 828 expression=seq_get(args, 1) or exp.Literal.string(","), 829 ), 830 "STRPOS": exp.StrPosition.from_arg_list, 831 "TIME": _build_time, 832 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 833 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 834 "TIMESTAMP": _build_timestamp, 835 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 836 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 837 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 838 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 839 ), 840 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 841 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 842 ), 843 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 844 "TO_JSON": lambda args: exp.JSONFormat( 845 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 846 ), 847 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 848 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 849 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 850 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 851 "FROM_HEX": exp.Unhex.from_arg_list, 852 "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 853 } 854 855 FUNCTION_PARSERS = { 856 **parser.Parser.FUNCTION_PARSERS, 857 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 858 "JSON_ARRAY": lambda self: self.expression( 859 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 860 ), 861 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 862 "PREDICT": lambda self: self._parse_ml(exp.Predict), 863 "TRANSLATE": lambda self: self._parse_translate(), 864 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 865 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 866 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 867 exp.GenerateEmbedding, is_text=True 868 ), 869 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 870 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 871 } 872 FUNCTION_PARSERS.pop("TRIM") 873 874 NO_PAREN_FUNCTIONS = { 875 **parser.Parser.NO_PAREN_FUNCTIONS, 876 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 877 } 878 879 NESTED_TYPE_TOKENS = { 880 *parser.Parser.NESTED_TYPE_TOKENS, 881 TokenType.TABLE, 882 } 883 884 PROPERTY_PARSERS = { 885 **parser.Parser.PROPERTY_PARSERS, 886 "NOT DETERMINISTIC": lambda self: self.expression( 887 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 888 ), 889 "OPTIONS": lambda self: self._parse_with_property(), 890 } 891 892 CONSTRAINT_PARSERS = { 893 **parser.Parser.CONSTRAINT_PARSERS, 894 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 895 } 896 897 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 898 RANGE_PARSERS.pop(TokenType.OVERLAPS) 899 900 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 901 902 STATEMENT_PARSERS = { 903 **parser.Parser.STATEMENT_PARSERS, 904 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 905 TokenType.END: lambda self: self._parse_as_command(self._prev), 906 TokenType.FOR: lambda self: self._parse_for_in(), 907 TokenType.EXPORT: lambda self: self._parse_export_data(), 908 TokenType.DECLARE: lambda self: self._parse_declare(), 909 } 910 911 BRACKET_OFFSETS = { 912 "OFFSET": (0, False), 913 "ORDINAL": (1, False), 914 "SAFE_OFFSET": (0, True), 915 "SAFE_ORDINAL": (1, True), 916 } 917 918 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 919 index = self._index 920 this = self._parse_range() 921 self._match_text_seq("DO") 922 if self._match(TokenType.COMMAND): 923 self._retreat(index) 924 return self._parse_as_command(self._prev) 925 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 926 927 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 928 this = super()._parse_table_part(schema=schema) or self._parse_number() 929 930 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 931 if isinstance(this, exp.Identifier): 932 table_name = this.name 933 while self._match(TokenType.DASH, advance=False) and self._next: 934 start = self._curr 935 while self._is_connected() and not self._match_set( 936 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 937 ): 938 self._advance() 939 940 if start == self._curr: 941 break 942 943 table_name += self._find_sql(start, self._prev) 944 945 this = exp.Identifier( 946 this=table_name, quoted=this.args.get("quoted") 947 ).update_positions(this) 948 elif isinstance(this, exp.Literal): 949 table_name = this.name 950 951 if self._is_connected() and self._parse_var(any_token=True): 952 table_name += self._prev.text 953 954 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 955 956 return this 957 958 def _parse_table_parts( 959 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 960 ) -> exp.Table: 961 table = super()._parse_table_parts( 962 schema=schema, is_db_reference=is_db_reference, wildcard=True 963 ) 964 965 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 966 if not table.catalog: 967 if table.db: 968 previous_db = table.args["db"] 969 parts = table.db.split(".") 970 if len(parts) == 2 and not table.args["db"].quoted: 971 table.set( 972 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 973 ) 974 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 975 else: 976 previous_this = table.this 977 parts = table.name.split(".") 978 if len(parts) == 2 and not table.this.quoted: 979 table.set( 980 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 981 ) 982 table.set( 983 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 984 ) 985 986 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 987 alias = table.this 988 catalog, db, this, *rest = ( 989 exp.to_identifier(p, quoted=True) 990 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 991 ) 992 993 for part in (catalog, db, this): 994 if part: 995 part.update_positions(table.this) 996 997 if rest and this: 998 this = exp.Dot.build([this, *rest]) # type: ignore 999 1000 table = exp.Table( 1001 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1002 ) 1003 table.meta["quoted_table"] = True 1004 else: 1005 alias = None 1006 1007 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1008 # dataset, so if the project identifier is omitted we need to fix the ast so that 1009 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1010 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1011 # views, because it would seem like the "catalog" part is set, when it'd actually 1012 # be the region/dataset. Merging the two identifiers into a single one is done to 1013 # avoid producing a 4-part Table reference, which would cause issues in the schema 1014 # module, when there are 3-part table names mixed with information schema views. 1015 # 1016 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1017 table_parts = table.parts 1018 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1019 # We need to alias the table here to avoid breaking existing qualified columns. 1020 # This is expected to be safe, because if there's an actual alias coming up in 1021 # the token stream, it will overwrite this one. If there isn't one, we are only 1022 # exposing the name that can be used to reference the view explicitly (a no-op). 1023 exp.alias_( 1024 table, 1025 t.cast(exp.Identifier, alias or table_parts[-1]), 1026 table=True, 1027 copy=False, 1028 ) 1029 1030 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1031 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1032 line=table_parts[-2].meta.get("line"), 1033 col=table_parts[-1].meta.get("col"), 1034 start=table_parts[-2].meta.get("start"), 1035 end=table_parts[-1].meta.get("end"), 1036 ) 1037 table.set("this", new_this) 1038 table.set("db", seq_get(table_parts, -3)) 1039 table.set("catalog", seq_get(table_parts, -4)) 1040 1041 return table 1042 1043 def _parse_column(self) -> t.Optional[exp.Expression]: 1044 column = super()._parse_column() 1045 if isinstance(column, exp.Column): 1046 parts = column.parts 1047 if any("." in p.name for p in parts): 1048 catalog, db, table, this, *rest = ( 1049 exp.to_identifier(p, quoted=True) 1050 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1051 ) 1052 1053 if rest and this: 1054 this = exp.Dot.build([this, *rest]) # type: ignore 1055 1056 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1057 column.meta["quoted_column"] = True 1058 1059 return column 1060 1061 @t.overload 1062 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1063 1064 @t.overload 1065 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 1066 1067 def _parse_json_object(self, agg=False): 1068 json_object = super()._parse_json_object() 1069 array_kv_pair = seq_get(json_object.expressions, 0) 1070 1071 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1072 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1073 if ( 1074 array_kv_pair 1075 and isinstance(array_kv_pair.this, exp.Array) 1076 and isinstance(array_kv_pair.expression, exp.Array) 1077 ): 1078 keys = array_kv_pair.this.expressions 1079 values = array_kv_pair.expression.expressions 1080 1081 json_object.set( 1082 "expressions", 1083 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1084 ) 1085 1086 return json_object 1087 1088 def _parse_bracket( 1089 self, this: t.Optional[exp.Expression] = None 1090 ) -> t.Optional[exp.Expression]: 1091 bracket = super()._parse_bracket(this) 1092 1093 if this is bracket: 1094 return bracket 1095 1096 if isinstance(bracket, exp.Bracket): 1097 for expression in bracket.expressions: 1098 name = expression.name.upper() 1099 1100 if name not in self.BRACKET_OFFSETS: 1101 break 1102 1103 offset, safe = self.BRACKET_OFFSETS[name] 1104 bracket.set("offset", offset) 1105 bracket.set("safe", safe) 1106 expression.replace(expression.expressions[0]) 1107 1108 return bracket 1109 1110 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1111 unnest = super()._parse_unnest(with_alias=with_alias) 1112 1113 if not unnest: 1114 return None 1115 1116 unnest_expr = seq_get(unnest.expressions, 0) 1117 if unnest_expr: 1118 from sqlglot.optimizer.annotate_types import annotate_types 1119 1120 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1121 1122 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1123 # in contrast to other dialects such as DuckDB which flattens only the array by default 1124 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1125 array_elem.is_type(exp.DataType.Type.STRUCT) 1126 for array_elem in unnest_expr._type.expressions 1127 ): 1128 unnest.set("explode_array", True) 1129 1130 return unnest 1131 1132 def _parse_make_interval(self) -> exp.MakeInterval: 1133 expr = exp.MakeInterval() 1134 1135 for arg_key in expr.arg_types: 1136 value = self._parse_lambda() 1137 1138 if not value: 1139 break 1140 1141 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1142 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1143 if isinstance(value, exp.Kwarg): 1144 arg_key = value.this.name 1145 1146 expr.set(arg_key, value) 1147 1148 self._match(TokenType.COMMA) 1149 1150 return expr 1151 1152 def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E: 1153 self._match_text_seq("MODEL") 1154 this = self._parse_table() 1155 1156 self._match(TokenType.COMMA) 1157 self._match_text_seq("TABLE") 1158 1159 # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one 1160 expression = ( 1161 self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None 1162 ) 1163 1164 self._match(TokenType.COMMA) 1165 1166 return self.expression( 1167 expr_type, 1168 this=this, 1169 expression=expression, 1170 params_struct=self._parse_bitwise(), 1171 **kwargs, 1172 ) 1173 1174 def _parse_translate(self) -> exp.Translate | exp.MLTranslate: 1175 # Check if this is ML.TRANSLATE by looking at previous tokens 1176 token = seq_get(self._tokens, self._index - 4) 1177 if token and token.text.upper() == "ML": 1178 return self._parse_ml(exp.MLTranslate) 1179 1180 return exp.Translate.from_arg_list(self._parse_function_args()) 1181 1182 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1183 self._match(TokenType.TABLE) 1184 this = self._parse_table() 1185 1186 expr = self.expression(exp.FeaturesAtTime, this=this) 1187 1188 while self._match(TokenType.COMMA): 1189 arg = self._parse_lambda() 1190 1191 # Get the LHS of the Kwarg and set the arg to that value, e.g 1192 # "num_rows => 1" sets the expr's `num_rows` arg 1193 if arg: 1194 expr.set(arg.this.name, arg) 1195 1196 return expr 1197 1198 def _parse_vector_search(self) -> exp.VectorSearch: 1199 self._match(TokenType.TABLE) 1200 base_table = self._parse_table() 1201 1202 self._match(TokenType.COMMA) 1203 1204 column_to_search = self._parse_bitwise() 1205 self._match(TokenType.COMMA) 1206 1207 self._match(TokenType.TABLE) 1208 query_table = self._parse_table() 1209 1210 expr = self.expression( 1211 exp.VectorSearch, 1212 this=base_table, 1213 column_to_search=column_to_search, 1214 query_table=query_table, 1215 ) 1216 1217 while self._match(TokenType.COMMA): 1218 # query_column_to_search can be named argument or positional 1219 if self._match(TokenType.STRING, advance=False): 1220 query_column = self._parse_string() 1221 expr.set("query_column_to_search", query_column) 1222 else: 1223 arg = self._parse_lambda() 1224 if arg: 1225 expr.set(arg.this.name, arg) 1226 1227 return expr 1228 1229 def _parse_export_data(self) -> exp.Export: 1230 self._match_text_seq("DATA") 1231 1232 return self.expression( 1233 exp.Export, 1234 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1235 options=self._parse_properties(), 1236 this=self._match_text_seq("AS") and self._parse_select(), 1237 ) 1238 1239 class Generator(generator.Generator): 1240 INTERVAL_ALLOWS_PLURAL_FORM = False 1241 JOIN_HINTS = False 1242 QUERY_HINTS = False 1243 TABLE_HINTS = False 1244 LIMIT_FETCH = "LIMIT" 1245 RENAME_TABLE_WITH_DB = False 1246 NVL2_SUPPORTED = False 1247 UNNEST_WITH_ORDINALITY = False 1248 COLLATE_IS_FUNC = True 1249 LIMIT_ONLY_LITERALS = True 1250 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1251 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1252 JSON_KEY_VALUE_PAIR_SEP = "," 1253 NULL_ORDERING_SUPPORTED = False 1254 IGNORE_NULLS_IN_FUNC = True 1255 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1256 CAN_IMPLEMENT_ARRAY_ANY = True 1257 SUPPORTS_TO_NUMBER = False 1258 NAMED_PLACEHOLDER_TOKEN = "@" 1259 HEX_FUNC = "TO_HEX" 1260 WITH_PROPERTIES_PREFIX = "OPTIONS" 1261 SUPPORTS_EXPLODING_PROJECTIONS = False 1262 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1263 SUPPORTS_UNIX_SECONDS = True 1264 1265 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1266 1267 TS_OR_DS_TYPES = ( 1268 exp.TsOrDsToDatetime, 1269 exp.TsOrDsToTimestamp, 1270 exp.TsOrDsToTime, 1271 exp.TsOrDsToDate, 1272 ) 1273 1274 TRANSFORMS = { 1275 **generator.Generator.TRANSFORMS, 1276 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1277 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1278 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 1279 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1280 exp.Array: inline_array_unless_query, 1281 exp.ArrayContains: _array_contains_sql, 1282 exp.ArrayFilter: filter_array_using_unnest, 1283 exp.ArrayRemove: filter_array_using_unnest, 1284 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1285 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1286 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1287 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1288 exp.ByteLength: rename_func("BYTE_LENGTH"), 1289 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1290 exp.CollateProperty: lambda self, e: ( 1291 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1292 if e.args.get("default") 1293 else f"COLLATE {self.sql(e, 'this')}" 1294 ), 1295 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1296 exp.CountIf: rename_func("COUNTIF"), 1297 exp.Create: _create_sql, 1298 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1299 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1300 exp.DateDiff: lambda self, e: self.func( 1301 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1302 ), 1303 exp.DateFromParts: rename_func("DATE"), 1304 exp.DateStrToDate: datestrtodate_sql, 1305 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1306 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1307 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1308 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1309 exp.FromTimeZone: lambda self, e: self.func( 1310 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1311 ), 1312 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1313 exp.GroupConcat: lambda self, e: groupconcat_sql( 1314 self, e, func_name="STRING_AGG", within_group=False 1315 ), 1316 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1317 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1318 exp.If: if_sql(false_value="NULL"), 1319 exp.ILike: no_ilike_sql, 1320 exp.IntDiv: rename_func("DIV"), 1321 exp.Int64: rename_func("INT64"), 1322 exp.JSONBool: rename_func("BOOL"), 1323 exp.JSONExtract: _json_extract_sql, 1324 exp.JSONExtractArray: _json_extract_sql, 1325 exp.JSONExtractScalar: _json_extract_sql, 1326 exp.JSONFormat: lambda self, e: self.func( 1327 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1328 e.this, 1329 e.args.get("options"), 1330 ), 1331 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1332 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1333 exp.Levenshtein: _levenshtein_sql, 1334 exp.Max: max_or_greatest, 1335 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1336 exp.MD5Digest: rename_func("MD5"), 1337 exp.Min: min_or_least, 1338 exp.Normalize: lambda self, e: self.func( 1339 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1340 e.this, 1341 e.args.get("form"), 1342 ), 1343 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1344 exp.RegexpExtract: lambda self, e: self.func( 1345 "REGEXP_EXTRACT", 1346 e.this, 1347 e.expression, 1348 e.args.get("position"), 1349 e.args.get("occurrence"), 1350 ), 1351 exp.RegexpExtractAll: lambda self, e: self.func( 1352 "REGEXP_EXTRACT_ALL", e.this, e.expression 1353 ), 1354 exp.RegexpReplace: regexp_replace_sql, 1355 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1356 exp.ReturnsProperty: _returnsproperty_sql, 1357 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1358 exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this), 1359 exp.ParseDatetime: lambda self, e: self.func( 1360 "PARSE_DATETIME", self.format_time(e), e.this 1361 ), 1362 exp.Select: transforms.preprocess( 1363 [ 1364 transforms.explode_projection_to_unnest(), 1365 transforms.unqualify_unnest, 1366 transforms.eliminate_distinct_on, 1367 _alias_ordered_group, 1368 transforms.eliminate_semi_and_anti_joins, 1369 ] 1370 ), 1371 exp.SHA: rename_func("SHA1"), 1372 exp.SHA2: sha256_sql, 1373 exp.StabilityProperty: lambda self, e: ( 1374 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1375 ), 1376 exp.String: rename_func("STRING"), 1377 exp.StrPosition: lambda self, e: ( 1378 strposition_sql( 1379 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1380 ) 1381 ), 1382 exp.StrToDate: _str_to_datetime_sql, 1383 exp.StrToTime: _str_to_datetime_sql, 1384 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1385 exp.TimeFromParts: rename_func("TIME"), 1386 exp.TimestampFromParts: rename_func("DATETIME"), 1387 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1388 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1389 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1390 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1391 exp.TimeStrToTime: timestrtotime_sql, 1392 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1393 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1394 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1395 exp.TsOrDsToTime: rename_func("TIME"), 1396 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1397 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1398 exp.Unhex: rename_func("FROM_HEX"), 1399 exp.UnixDate: rename_func("UNIX_DATE"), 1400 exp.UnixToTime: _unix_to_time_sql, 1401 exp.Uuid: lambda *_: "GENERATE_UUID()", 1402 exp.Values: _derived_table_values_to_unnest, 1403 exp.VariancePop: rename_func("VAR_POP"), 1404 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1405 } 1406 1407 SUPPORTED_JSON_PATH_PARTS = { 1408 exp.JSONPathKey, 1409 exp.JSONPathRoot, 1410 exp.JSONPathSubscript, 1411 } 1412 1413 TYPE_MAPPING = { 1414 **generator.Generator.TYPE_MAPPING, 1415 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1416 exp.DataType.Type.BIGINT: "INT64", 1417 exp.DataType.Type.BINARY: "BYTES", 1418 exp.DataType.Type.BLOB: "BYTES", 1419 exp.DataType.Type.BOOLEAN: "BOOL", 1420 exp.DataType.Type.CHAR: "STRING", 1421 exp.DataType.Type.DECIMAL: "NUMERIC", 1422 exp.DataType.Type.DOUBLE: "FLOAT64", 1423 exp.DataType.Type.FLOAT: "FLOAT64", 1424 exp.DataType.Type.INT: "INT64", 1425 exp.DataType.Type.NCHAR: "STRING", 1426 exp.DataType.Type.NVARCHAR: "STRING", 1427 exp.DataType.Type.SMALLINT: "INT64", 1428 exp.DataType.Type.TEXT: "STRING", 1429 exp.DataType.Type.TIMESTAMP: "DATETIME", 1430 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1431 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1432 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1433 exp.DataType.Type.TINYINT: "INT64", 1434 exp.DataType.Type.ROWVERSION: "BYTES", 1435 exp.DataType.Type.UUID: "STRING", 1436 exp.DataType.Type.VARBINARY: "BYTES", 1437 exp.DataType.Type.VARCHAR: "STRING", 1438 exp.DataType.Type.VARIANT: "ANY TYPE", 1439 } 1440 1441 PROPERTIES_LOCATION = { 1442 **generator.Generator.PROPERTIES_LOCATION, 1443 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1444 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1445 } 1446 1447 # WINDOW comes after QUALIFY 1448 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1449 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1450 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1451 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1452 } 1453 1454 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1455 RESERVED_KEYWORDS = { 1456 "all", 1457 "and", 1458 "any", 1459 "array", 1460 "as", 1461 "asc", 1462 "assert_rows_modified", 1463 "at", 1464 "between", 1465 "by", 1466 "case", 1467 "cast", 1468 "collate", 1469 "contains", 1470 "create", 1471 "cross", 1472 "cube", 1473 "current", 1474 "default", 1475 "define", 1476 "desc", 1477 "distinct", 1478 "else", 1479 "end", 1480 "enum", 1481 "escape", 1482 "except", 1483 "exclude", 1484 "exists", 1485 "extract", 1486 "false", 1487 "fetch", 1488 "following", 1489 "for", 1490 "from", 1491 "full", 1492 "group", 1493 "grouping", 1494 "groups", 1495 "hash", 1496 "having", 1497 "if", 1498 "ignore", 1499 "in", 1500 "inner", 1501 "intersect", 1502 "interval", 1503 "into", 1504 "is", 1505 "join", 1506 "lateral", 1507 "left", 1508 "like", 1509 "limit", 1510 "lookup", 1511 "merge", 1512 "natural", 1513 "new", 1514 "no", 1515 "not", 1516 "null", 1517 "nulls", 1518 "of", 1519 "on", 1520 "or", 1521 "order", 1522 "outer", 1523 "over", 1524 "partition", 1525 "preceding", 1526 "proto", 1527 "qualify", 1528 "range", 1529 "recursive", 1530 "respect", 1531 "right", 1532 "rollup", 1533 "rows", 1534 "select", 1535 "set", 1536 "some", 1537 "struct", 1538 "tablesample", 1539 "then", 1540 "to", 1541 "treat", 1542 "true", 1543 "unbounded", 1544 "union", 1545 "unnest", 1546 "using", 1547 "when", 1548 "where", 1549 "window", 1550 "with", 1551 "within", 1552 } 1553 1554 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1555 unit = expression.unit 1556 unit_sql = unit.name if unit.is_string else self.sql(unit) 1557 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1558 1559 def mod_sql(self, expression: exp.Mod) -> str: 1560 this = expression.this 1561 expr = expression.expression 1562 return self.func( 1563 "MOD", 1564 this.unnest() if isinstance(this, exp.Paren) else this, 1565 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1566 ) 1567 1568 def column_parts(self, expression: exp.Column) -> str: 1569 if expression.meta.get("quoted_column"): 1570 # If a column reference is of the form `dataset.table`.name, we need 1571 # to preserve the quoted table path, otherwise the reference breaks 1572 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1573 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1574 return f"{table_path}.{self.sql(expression, 'this')}" 1575 1576 return super().column_parts(expression) 1577 1578 def table_parts(self, expression: exp.Table) -> str: 1579 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1580 # we need to make sure the correct quoting is used in each case. 1581 # 1582 # For example, if there is a CTE x that clashes with a schema name, then the former will 1583 # return the table y in that schema, whereas the latter will return the CTE's y column: 1584 # 1585 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1586 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1587 if expression.meta.get("quoted_table"): 1588 table_parts = ".".join(p.name for p in expression.parts) 1589 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1590 1591 return super().table_parts(expression) 1592 1593 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1594 this = expression.this 1595 if isinstance(this, exp.TsOrDsToDatetime): 1596 func_name = "FORMAT_DATETIME" 1597 elif isinstance(this, exp.TsOrDsToTimestamp): 1598 func_name = "FORMAT_TIMESTAMP" 1599 elif isinstance(this, exp.TsOrDsToTime): 1600 func_name = "FORMAT_TIME" 1601 else: 1602 func_name = "FORMAT_DATE" 1603 1604 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1605 return self.func( 1606 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1607 ) 1608 1609 def eq_sql(self, expression: exp.EQ) -> str: 1610 # Operands of = cannot be NULL in BigQuery 1611 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1612 if not isinstance(expression.parent, exp.Update): 1613 return "NULL" 1614 1615 return self.binary(expression, "=") 1616 1617 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1618 parent = expression.parent 1619 1620 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1621 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1622 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1623 return self.func( 1624 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1625 ) 1626 1627 return super().attimezone_sql(expression) 1628 1629 def trycast_sql(self, expression: exp.TryCast) -> str: 1630 return self.cast_sql(expression, safe_prefix="SAFE_") 1631 1632 def bracket_sql(self, expression: exp.Bracket) -> str: 1633 this = expression.this 1634 expressions = expression.expressions 1635 1636 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1637 arg = expressions[0] 1638 if arg.type is None: 1639 from sqlglot.optimizer.annotate_types import annotate_types 1640 1641 arg = annotate_types(arg, dialect=self.dialect) 1642 1643 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1644 # BQ doesn't support bracket syntax with string values for structs 1645 return f"{self.sql(this)}.{arg.name}" 1646 1647 expressions_sql = self.expressions(expression, flat=True) 1648 offset = expression.args.get("offset") 1649 1650 if offset == 0: 1651 expressions_sql = f"OFFSET({expressions_sql})" 1652 elif offset == 1: 1653 expressions_sql = f"ORDINAL({expressions_sql})" 1654 elif offset is not None: 1655 self.unsupported(f"Unsupported array offset: {offset}") 1656 1657 if expression.args.get("safe"): 1658 expressions_sql = f"SAFE_{expressions_sql}" 1659 1660 return f"{self.sql(this)}[{expressions_sql}]" 1661 1662 def in_unnest_op(self, expression: exp.Unnest) -> str: 1663 return self.sql(expression) 1664 1665 def version_sql(self, expression: exp.Version) -> str: 1666 if expression.name == "TIMESTAMP": 1667 expression.set("this", "SYSTEM_TIME") 1668 return super().version_sql(expression) 1669 1670 def contains_sql(self, expression: exp.Contains) -> str: 1671 this = expression.this 1672 expr = expression.expression 1673 1674 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1675 this = this.this 1676 expr = expr.this 1677 1678 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1679 1680 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1681 this = expression.this 1682 1683 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1684 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1685 # because they aren't literals and so the above syntax is invalid BigQuery. 1686 if isinstance(this, exp.Array): 1687 elem = seq_get(this.expressions, 0) 1688 if not (elem and elem.find(exp.Query)): 1689 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1690 1691 return super().cast_sql(expression, safe_prefix=safe_prefix) 1692 1693 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1694 variables = self.expressions(expression, "this") 1695 default = self.sql(expression, "default") 1696 default = f" DEFAULT {default}" if default else "" 1697 kind = self.sql(expression, "kind") 1698 kind = f" {kind}" if kind else "" 1699 1700 return f"{variables}{kind}{default}"
440class BigQuery(Dialect): 441 WEEK_OFFSET = -1 442 UNNEST_COLUMN_ONLY = True 443 SUPPORTS_USER_DEFINED_TYPES = False 444 SUPPORTS_SEMI_ANTI_JOIN = False 445 LOG_BASE_FIRST = False 446 HEX_LOWERCASE = True 447 FORCE_EARLY_ALIAS_REF_EXPANSION = True 448 PRESERVE_ORIGINAL_NAMES = True 449 HEX_STRING_IS_INTEGER_TYPE = True 450 451 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 452 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 453 454 # bigquery udfs are case sensitive 455 NORMALIZE_FUNCTIONS = False 456 457 # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time 458 TIME_MAPPING = { 459 "%D": "%m/%d/%y", 460 "%E6S": "%S.%f", 461 "%e": "%-d", 462 } 463 464 FORMAT_MAPPING = { 465 "DD": "%d", 466 "MM": "%m", 467 "MON": "%b", 468 "MONTH": "%B", 469 "YYYY": "%Y", 470 "YY": "%y", 471 "HH": "%I", 472 "HH12": "%I", 473 "HH24": "%H", 474 "MI": "%M", 475 "SS": "%S", 476 "SSSSS": "%f", 477 "TZH": "%z", 478 } 479 480 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 481 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 482 # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix 483 # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column 484 PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE", "_TABLE_SUFFIX", "_FILE_NAME"} 485 486 # All set operations require either a DISTINCT or ALL specifier 487 SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None) 488 489 # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont 490 COERCES_TO = { 491 **TypeAnnotator.COERCES_TO, 492 exp.DataType.Type.BIGDECIMAL: {exp.DataType.Type.DOUBLE}, 493 } 494 COERCES_TO[exp.DataType.Type.DECIMAL] |= {exp.DataType.Type.BIGDECIMAL} 495 COERCES_TO[exp.DataType.Type.BIGINT] |= {exp.DataType.Type.BIGDECIMAL} 496 497 # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types 498 TYPE_TO_EXPRESSIONS = { 499 **Dialect.TYPE_TO_EXPRESSIONS, 500 exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP], 501 } 502 TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP) 503 504 ANNOTATORS = { 505 **Dialect.ANNOTATORS, 506 **{ 507 expr_type: annotate_with_type_lambda(data_type) 508 for data_type, expressions in TYPE_TO_EXPRESSIONS.items() 509 for expr_type in expressions 510 }, 511 **{ 512 expr_type: lambda self, e: _annotate_math_functions(self, e) 513 for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round) 514 }, 515 **{ 516 expr_type: lambda self, e: self._annotate_by_args(e, "this") 517 for expr_type in ( 518 exp.Abs, 519 exp.ArgMax, 520 exp.ArgMin, 521 exp.DateTrunc, 522 exp.DatetimeTrunc, 523 exp.FirstValue, 524 exp.GroupConcat, 525 exp.IgnoreNulls, 526 exp.JSONExtract, 527 exp.Lead, 528 exp.Left, 529 exp.Lower, 530 exp.NthValue, 531 exp.Pad, 532 exp.PercentileDisc, 533 exp.RegexpExtract, 534 exp.RegexpReplace, 535 exp.Repeat, 536 exp.Replace, 537 exp.RespectNulls, 538 exp.Reverse, 539 exp.Right, 540 exp.SafeNegate, 541 exp.Sign, 542 exp.Substring, 543 exp.TimestampTrunc, 544 exp.Translate, 545 exp.Trim, 546 exp.Upper, 547 ) 548 }, 549 exp.Acos: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 550 exp.Acosh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 551 exp.Asin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 552 exp.Asinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 553 exp.Atan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 554 exp.Atanh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 555 exp.Atan2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 556 exp.ApproxTopSum: lambda self, e: _annotate_by_args_approx_top(self, e), 557 exp.ApproxTopK: lambda self, e: _annotate_by_args_approx_top(self, e), 558 exp.ApproxQuantiles: lambda self, e: self._annotate_by_args(e, "this", array=True), 559 exp.Array: _annotate_array, 560 exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"), 561 exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 562 exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 563 exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 564 exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 565 exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 566 exp.ByteLength: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 567 exp.ByteString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 568 exp.Cbrt: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 569 exp.CodePointsToBytes: lambda self, e: self._annotate_with_type( 570 e, exp.DataType.Type.BINARY 571 ), 572 exp.CodePointsToString: lambda self, e: self._annotate_with_type( 573 e, exp.DataType.Type.VARCHAR 574 ), 575 exp.Concat: _annotate_concat, 576 exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 577 exp.Cot: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 578 exp.CosineDistance: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 579 exp.Coth: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 580 exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 581 exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 582 exp.Csc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 583 exp.Csch: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 584 exp.CumeDist: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 585 exp.DateFromUnixDate: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATE), 586 exp.DenseRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 587 exp.EuclideanDistance: lambda self, e: self._annotate_with_type( 588 e, exp.DataType.Type.DOUBLE 589 ), 590 exp.FarmFingerprint: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 591 exp.Unhex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 592 exp.Float64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 593 exp.Format: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 594 exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type( 595 e, exp.DataType.build("ARRAY<TIMESTAMP>", dialect="bigquery") 596 ), 597 exp.Grouping: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 598 exp.IsInf: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 599 exp.IsNan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 600 exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 601 exp.JSONArrayAppend: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 602 exp.JSONArrayInsert: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 603 exp.JSONBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 604 exp.JSONExtractScalar: lambda self, e: self._annotate_with_type( 605 e, exp.DataType.Type.VARCHAR 606 ), 607 exp.JSONExtractArray: lambda self, e: self._annotate_by_args(e, "this", array=True), 608 exp.JSONFormat: lambda self, e: self._annotate_with_type( 609 e, exp.DataType.Type.JSON if e.args.get("to_json") else exp.DataType.Type.VARCHAR 610 ), 611 exp.JSONKeysAtDepth: lambda self, e: self._annotate_with_type( 612 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 613 ), 614 exp.JSONObject: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 615 exp.JSONRemove: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 616 exp.JSONSet: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 617 exp.JSONStripNulls: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 618 exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 619 exp.JSONValueArray: lambda self, e: self._annotate_with_type( 620 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 621 ), 622 exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"), 623 exp.LowerHex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 624 exp.LaxBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 625 exp.LaxFloat64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 626 exp.LaxInt64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 627 exp.LaxString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 628 exp.MD5Digest: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 629 exp.Normalize: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 630 exp.Ntile: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 631 exp.ParseTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 632 exp.ParseDatetime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATETIME), 633 exp.ParseBignumeric: lambda self, e: self._annotate_with_type( 634 e, exp.DataType.Type.BIGDECIMAL 635 ), 636 exp.ParseNumeric: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DECIMAL), 637 exp.PercentileCont: lambda self, e: _annotate_by_args_with_coerce(self, e), 638 exp.PercentRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 639 exp.Rank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 640 exp.RangeBucket: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 641 exp.RegexpExtractAll: lambda self, e: self._annotate_by_args(e, "this", array=True), 642 exp.RegexpInstr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 643 exp.RowNumber: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 644 exp.Rand: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 645 exp.SafeConvertBytesToString: lambda self, e: self._annotate_with_type( 646 e, exp.DataType.Type.VARCHAR 647 ), 648 exp.SafeAdd: lambda self, e: _annotate_by_args_with_coerce(self, e), 649 exp.SafeMultiply: lambda self, e: _annotate_by_args_with_coerce(self, e), 650 exp.SafeSubtract: lambda self, e: _annotate_by_args_with_coerce(self, e), 651 exp.Sec: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 652 exp.Sech: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 653 exp.Soundex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 654 exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 655 exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 656 exp.Sin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 657 exp.Sinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 658 exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True), 659 exp.TimestampFromParts: lambda self, e: self._annotate_with_type( 660 e, exp.DataType.Type.DATETIME 661 ), 662 exp.TimeFromParts: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 663 exp.TimeTrunc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 664 exp.ToCodePoints: lambda self, e: self._annotate_with_type( 665 e, exp.DataType.build("ARRAY<BIGINT>", dialect="bigquery") 666 ), 667 exp.TsOrDsToTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 668 exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 669 exp.Uuid: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 670 } 671 672 def normalize_identifier(self, expression: E) -> E: 673 if ( 674 isinstance(expression, exp.Identifier) 675 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 676 ): 677 parent = expression.parent 678 while isinstance(parent, exp.Dot): 679 parent = parent.parent 680 681 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 682 # by default. The following check uses a heuristic to detect tables based on whether 683 # they are qualified. This should generally be correct, because tables in BigQuery 684 # must be qualified with at least a dataset, unless @@dataset_id is set. 685 case_sensitive = ( 686 isinstance(parent, exp.UserDefinedFunction) 687 or ( 688 isinstance(parent, exp.Table) 689 and parent.db 690 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 691 ) 692 or expression.meta.get("is_table") 693 ) 694 if not case_sensitive: 695 expression.set("this", expression.this.lower()) 696 697 return t.cast(E, expression) 698 699 return super().normalize_identifier(expression) 700 701 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 702 VAR_TOKENS = { 703 TokenType.DASH, 704 TokenType.VAR, 705 } 706 707 class Tokenizer(tokens.Tokenizer): 708 QUOTES = ["'", '"', '"""', "'''"] 709 COMMENTS = ["--", "#", ("/*", "*/")] 710 IDENTIFIERS = ["`"] 711 STRING_ESCAPES = ["\\"] 712 713 HEX_STRINGS = [("0x", ""), ("0X", "")] 714 715 BYTE_STRINGS = [ 716 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 717 ] 718 719 RAW_STRINGS = [ 720 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 721 ] 722 723 NESTED_COMMENTS = False 724 725 KEYWORDS = { 726 **tokens.Tokenizer.KEYWORDS, 727 "ANY TYPE": TokenType.VARIANT, 728 "BEGIN": TokenType.COMMAND, 729 "BEGIN TRANSACTION": TokenType.BEGIN, 730 "BYTEINT": TokenType.INT, 731 "BYTES": TokenType.BINARY, 732 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 733 "DATETIME": TokenType.TIMESTAMP, 734 "DECLARE": TokenType.DECLARE, 735 "ELSEIF": TokenType.COMMAND, 736 "EXCEPTION": TokenType.COMMAND, 737 "EXPORT": TokenType.EXPORT, 738 "FLOAT64": TokenType.DOUBLE, 739 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 740 "LOOP": TokenType.COMMAND, 741 "MODEL": TokenType.MODEL, 742 "NOT DETERMINISTIC": TokenType.VOLATILE, 743 "RECORD": TokenType.STRUCT, 744 "REPEAT": TokenType.COMMAND, 745 "TIMESTAMP": TokenType.TIMESTAMPTZ, 746 "WHILE": TokenType.COMMAND, 747 } 748 KEYWORDS.pop("DIV") 749 KEYWORDS.pop("VALUES") 750 KEYWORDS.pop("/*+") 751 752 class Parser(parser.Parser): 753 PREFIXED_PIVOT_COLUMNS = True 754 LOG_DEFAULTS_TO_LN = True 755 SUPPORTS_IMPLICIT_UNNEST = True 756 JOINS_HAVE_EQUAL_PRECEDENCE = True 757 758 # BigQuery does not allow ASC/DESC to be used as an identifier 759 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 760 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 761 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 762 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 763 TokenType.ASC, 764 TokenType.DESC, 765 } 766 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 767 768 FUNCTIONS = { 769 **parser.Parser.FUNCTIONS, 770 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 771 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 772 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 773 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 774 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 775 "BOOL": exp.JSONBool.from_arg_list, 776 "CONTAINS_SUBSTR": _build_contains_substring, 777 "DATE": _build_date, 778 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 779 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 780 "DATE_TRUNC": lambda args: exp.DateTrunc( 781 unit=seq_get(args, 1), 782 this=seq_get(args, 0), 783 zone=seq_get(args, 2), 784 ), 785 "DATETIME": _build_datetime, 786 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 787 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 788 "DIV": binary_from_function(exp.IntDiv), 789 "EDIT_DISTANCE": _build_levenshtein, 790 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 791 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 792 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 793 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 794 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 795 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 796 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 797 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 798 "JSON_STRIP_NULLS": _build_json_strip_nulls, 799 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 800 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 801 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 802 "MD5": exp.MD5Digest.from_arg_list, 803 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 804 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 805 ), 806 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 807 "TO_HEX": _build_to_hex, 808 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 809 [seq_get(args, 1), seq_get(args, 0)] 810 ), 811 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 812 [seq_get(args, 1), seq_get(args, 0)] 813 ), 814 "PARSE_TIMESTAMP": _build_parse_timestamp, 815 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 816 [seq_get(args, 1), seq_get(args, 0)] 817 ), 818 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 819 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 820 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 821 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 822 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 823 ), 824 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 825 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 826 "SPLIT": lambda args: exp.Split( 827 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 828 this=seq_get(args, 0), 829 expression=seq_get(args, 1) or exp.Literal.string(","), 830 ), 831 "STRPOS": exp.StrPosition.from_arg_list, 832 "TIME": _build_time, 833 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 834 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 835 "TIMESTAMP": _build_timestamp, 836 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 837 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 838 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 839 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 840 ), 841 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 842 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 843 ), 844 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 845 "TO_JSON": lambda args: exp.JSONFormat( 846 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 847 ), 848 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 849 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 850 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 851 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 852 "FROM_HEX": exp.Unhex.from_arg_list, 853 "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 854 } 855 856 FUNCTION_PARSERS = { 857 **parser.Parser.FUNCTION_PARSERS, 858 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 859 "JSON_ARRAY": lambda self: self.expression( 860 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 861 ), 862 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 863 "PREDICT": lambda self: self._parse_ml(exp.Predict), 864 "TRANSLATE": lambda self: self._parse_translate(), 865 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 866 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 867 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 868 exp.GenerateEmbedding, is_text=True 869 ), 870 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 871 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 872 } 873 FUNCTION_PARSERS.pop("TRIM") 874 875 NO_PAREN_FUNCTIONS = { 876 **parser.Parser.NO_PAREN_FUNCTIONS, 877 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 878 } 879 880 NESTED_TYPE_TOKENS = { 881 *parser.Parser.NESTED_TYPE_TOKENS, 882 TokenType.TABLE, 883 } 884 885 PROPERTY_PARSERS = { 886 **parser.Parser.PROPERTY_PARSERS, 887 "NOT DETERMINISTIC": lambda self: self.expression( 888 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 889 ), 890 "OPTIONS": lambda self: self._parse_with_property(), 891 } 892 893 CONSTRAINT_PARSERS = { 894 **parser.Parser.CONSTRAINT_PARSERS, 895 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 896 } 897 898 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 899 RANGE_PARSERS.pop(TokenType.OVERLAPS) 900 901 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 902 903 STATEMENT_PARSERS = { 904 **parser.Parser.STATEMENT_PARSERS, 905 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 906 TokenType.END: lambda self: self._parse_as_command(self._prev), 907 TokenType.FOR: lambda self: self._parse_for_in(), 908 TokenType.EXPORT: lambda self: self._parse_export_data(), 909 TokenType.DECLARE: lambda self: self._parse_declare(), 910 } 911 912 BRACKET_OFFSETS = { 913 "OFFSET": (0, False), 914 "ORDINAL": (1, False), 915 "SAFE_OFFSET": (0, True), 916 "SAFE_ORDINAL": (1, True), 917 } 918 919 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 920 index = self._index 921 this = self._parse_range() 922 self._match_text_seq("DO") 923 if self._match(TokenType.COMMAND): 924 self._retreat(index) 925 return self._parse_as_command(self._prev) 926 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 927 928 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 929 this = super()._parse_table_part(schema=schema) or self._parse_number() 930 931 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 932 if isinstance(this, exp.Identifier): 933 table_name = this.name 934 while self._match(TokenType.DASH, advance=False) and self._next: 935 start = self._curr 936 while self._is_connected() and not self._match_set( 937 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 938 ): 939 self._advance() 940 941 if start == self._curr: 942 break 943 944 table_name += self._find_sql(start, self._prev) 945 946 this = exp.Identifier( 947 this=table_name, quoted=this.args.get("quoted") 948 ).update_positions(this) 949 elif isinstance(this, exp.Literal): 950 table_name = this.name 951 952 if self._is_connected() and self._parse_var(any_token=True): 953 table_name += self._prev.text 954 955 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 956 957 return this 958 959 def _parse_table_parts( 960 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 961 ) -> exp.Table: 962 table = super()._parse_table_parts( 963 schema=schema, is_db_reference=is_db_reference, wildcard=True 964 ) 965 966 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 967 if not table.catalog: 968 if table.db: 969 previous_db = table.args["db"] 970 parts = table.db.split(".") 971 if len(parts) == 2 and not table.args["db"].quoted: 972 table.set( 973 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 974 ) 975 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 976 else: 977 previous_this = table.this 978 parts = table.name.split(".") 979 if len(parts) == 2 and not table.this.quoted: 980 table.set( 981 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 982 ) 983 table.set( 984 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 985 ) 986 987 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 988 alias = table.this 989 catalog, db, this, *rest = ( 990 exp.to_identifier(p, quoted=True) 991 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 992 ) 993 994 for part in (catalog, db, this): 995 if part: 996 part.update_positions(table.this) 997 998 if rest and this: 999 this = exp.Dot.build([this, *rest]) # type: ignore 1000 1001 table = exp.Table( 1002 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1003 ) 1004 table.meta["quoted_table"] = True 1005 else: 1006 alias = None 1007 1008 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1009 # dataset, so if the project identifier is omitted we need to fix the ast so that 1010 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1011 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1012 # views, because it would seem like the "catalog" part is set, when it'd actually 1013 # be the region/dataset. Merging the two identifiers into a single one is done to 1014 # avoid producing a 4-part Table reference, which would cause issues in the schema 1015 # module, when there are 3-part table names mixed with information schema views. 1016 # 1017 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1018 table_parts = table.parts 1019 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1020 # We need to alias the table here to avoid breaking existing qualified columns. 1021 # This is expected to be safe, because if there's an actual alias coming up in 1022 # the token stream, it will overwrite this one. If there isn't one, we are only 1023 # exposing the name that can be used to reference the view explicitly (a no-op). 1024 exp.alias_( 1025 table, 1026 t.cast(exp.Identifier, alias or table_parts[-1]), 1027 table=True, 1028 copy=False, 1029 ) 1030 1031 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1032 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1033 line=table_parts[-2].meta.get("line"), 1034 col=table_parts[-1].meta.get("col"), 1035 start=table_parts[-2].meta.get("start"), 1036 end=table_parts[-1].meta.get("end"), 1037 ) 1038 table.set("this", new_this) 1039 table.set("db", seq_get(table_parts, -3)) 1040 table.set("catalog", seq_get(table_parts, -4)) 1041 1042 return table 1043 1044 def _parse_column(self) -> t.Optional[exp.Expression]: 1045 column = super()._parse_column() 1046 if isinstance(column, exp.Column): 1047 parts = column.parts 1048 if any("." in p.name for p in parts): 1049 catalog, db, table, this, *rest = ( 1050 exp.to_identifier(p, quoted=True) 1051 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1052 ) 1053 1054 if rest and this: 1055 this = exp.Dot.build([this, *rest]) # type: ignore 1056 1057 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1058 column.meta["quoted_column"] = True 1059 1060 return column 1061 1062 @t.overload 1063 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1064 1065 @t.overload 1066 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 1067 1068 def _parse_json_object(self, agg=False): 1069 json_object = super()._parse_json_object() 1070 array_kv_pair = seq_get(json_object.expressions, 0) 1071 1072 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1073 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1074 if ( 1075 array_kv_pair 1076 and isinstance(array_kv_pair.this, exp.Array) 1077 and isinstance(array_kv_pair.expression, exp.Array) 1078 ): 1079 keys = array_kv_pair.this.expressions 1080 values = array_kv_pair.expression.expressions 1081 1082 json_object.set( 1083 "expressions", 1084 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1085 ) 1086 1087 return json_object 1088 1089 def _parse_bracket( 1090 self, this: t.Optional[exp.Expression] = None 1091 ) -> t.Optional[exp.Expression]: 1092 bracket = super()._parse_bracket(this) 1093 1094 if this is bracket: 1095 return bracket 1096 1097 if isinstance(bracket, exp.Bracket): 1098 for expression in bracket.expressions: 1099 name = expression.name.upper() 1100 1101 if name not in self.BRACKET_OFFSETS: 1102 break 1103 1104 offset, safe = self.BRACKET_OFFSETS[name] 1105 bracket.set("offset", offset) 1106 bracket.set("safe", safe) 1107 expression.replace(expression.expressions[0]) 1108 1109 return bracket 1110 1111 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1112 unnest = super()._parse_unnest(with_alias=with_alias) 1113 1114 if not unnest: 1115 return None 1116 1117 unnest_expr = seq_get(unnest.expressions, 0) 1118 if unnest_expr: 1119 from sqlglot.optimizer.annotate_types import annotate_types 1120 1121 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1122 1123 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1124 # in contrast to other dialects such as DuckDB which flattens only the array by default 1125 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1126 array_elem.is_type(exp.DataType.Type.STRUCT) 1127 for array_elem in unnest_expr._type.expressions 1128 ): 1129 unnest.set("explode_array", True) 1130 1131 return unnest 1132 1133 def _parse_make_interval(self) -> exp.MakeInterval: 1134 expr = exp.MakeInterval() 1135 1136 for arg_key in expr.arg_types: 1137 value = self._parse_lambda() 1138 1139 if not value: 1140 break 1141 1142 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1143 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1144 if isinstance(value, exp.Kwarg): 1145 arg_key = value.this.name 1146 1147 expr.set(arg_key, value) 1148 1149 self._match(TokenType.COMMA) 1150 1151 return expr 1152 1153 def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E: 1154 self._match_text_seq("MODEL") 1155 this = self._parse_table() 1156 1157 self._match(TokenType.COMMA) 1158 self._match_text_seq("TABLE") 1159 1160 # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one 1161 expression = ( 1162 self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None 1163 ) 1164 1165 self._match(TokenType.COMMA) 1166 1167 return self.expression( 1168 expr_type, 1169 this=this, 1170 expression=expression, 1171 params_struct=self._parse_bitwise(), 1172 **kwargs, 1173 ) 1174 1175 def _parse_translate(self) -> exp.Translate | exp.MLTranslate: 1176 # Check if this is ML.TRANSLATE by looking at previous tokens 1177 token = seq_get(self._tokens, self._index - 4) 1178 if token and token.text.upper() == "ML": 1179 return self._parse_ml(exp.MLTranslate) 1180 1181 return exp.Translate.from_arg_list(self._parse_function_args()) 1182 1183 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1184 self._match(TokenType.TABLE) 1185 this = self._parse_table() 1186 1187 expr = self.expression(exp.FeaturesAtTime, this=this) 1188 1189 while self._match(TokenType.COMMA): 1190 arg = self._parse_lambda() 1191 1192 # Get the LHS of the Kwarg and set the arg to that value, e.g 1193 # "num_rows => 1" sets the expr's `num_rows` arg 1194 if arg: 1195 expr.set(arg.this.name, arg) 1196 1197 return expr 1198 1199 def _parse_vector_search(self) -> exp.VectorSearch: 1200 self._match(TokenType.TABLE) 1201 base_table = self._parse_table() 1202 1203 self._match(TokenType.COMMA) 1204 1205 column_to_search = self._parse_bitwise() 1206 self._match(TokenType.COMMA) 1207 1208 self._match(TokenType.TABLE) 1209 query_table = self._parse_table() 1210 1211 expr = self.expression( 1212 exp.VectorSearch, 1213 this=base_table, 1214 column_to_search=column_to_search, 1215 query_table=query_table, 1216 ) 1217 1218 while self._match(TokenType.COMMA): 1219 # query_column_to_search can be named argument or positional 1220 if self._match(TokenType.STRING, advance=False): 1221 query_column = self._parse_string() 1222 expr.set("query_column_to_search", query_column) 1223 else: 1224 arg = self._parse_lambda() 1225 if arg: 1226 expr.set(arg.this.name, arg) 1227 1228 return expr 1229 1230 def _parse_export_data(self) -> exp.Export: 1231 self._match_text_seq("DATA") 1232 1233 return self.expression( 1234 exp.Export, 1235 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1236 options=self._parse_properties(), 1237 this=self._match_text_seq("AS") and self._parse_select(), 1238 ) 1239 1240 class Generator(generator.Generator): 1241 INTERVAL_ALLOWS_PLURAL_FORM = False 1242 JOIN_HINTS = False 1243 QUERY_HINTS = False 1244 TABLE_HINTS = False 1245 LIMIT_FETCH = "LIMIT" 1246 RENAME_TABLE_WITH_DB = False 1247 NVL2_SUPPORTED = False 1248 UNNEST_WITH_ORDINALITY = False 1249 COLLATE_IS_FUNC = True 1250 LIMIT_ONLY_LITERALS = True 1251 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1252 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1253 JSON_KEY_VALUE_PAIR_SEP = "," 1254 NULL_ORDERING_SUPPORTED = False 1255 IGNORE_NULLS_IN_FUNC = True 1256 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1257 CAN_IMPLEMENT_ARRAY_ANY = True 1258 SUPPORTS_TO_NUMBER = False 1259 NAMED_PLACEHOLDER_TOKEN = "@" 1260 HEX_FUNC = "TO_HEX" 1261 WITH_PROPERTIES_PREFIX = "OPTIONS" 1262 SUPPORTS_EXPLODING_PROJECTIONS = False 1263 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1264 SUPPORTS_UNIX_SECONDS = True 1265 1266 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1267 1268 TS_OR_DS_TYPES = ( 1269 exp.TsOrDsToDatetime, 1270 exp.TsOrDsToTimestamp, 1271 exp.TsOrDsToTime, 1272 exp.TsOrDsToDate, 1273 ) 1274 1275 TRANSFORMS = { 1276 **generator.Generator.TRANSFORMS, 1277 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1278 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1279 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 1280 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1281 exp.Array: inline_array_unless_query, 1282 exp.ArrayContains: _array_contains_sql, 1283 exp.ArrayFilter: filter_array_using_unnest, 1284 exp.ArrayRemove: filter_array_using_unnest, 1285 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1286 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1287 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1288 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1289 exp.ByteLength: rename_func("BYTE_LENGTH"), 1290 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1291 exp.CollateProperty: lambda self, e: ( 1292 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1293 if e.args.get("default") 1294 else f"COLLATE {self.sql(e, 'this')}" 1295 ), 1296 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1297 exp.CountIf: rename_func("COUNTIF"), 1298 exp.Create: _create_sql, 1299 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1300 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1301 exp.DateDiff: lambda self, e: self.func( 1302 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1303 ), 1304 exp.DateFromParts: rename_func("DATE"), 1305 exp.DateStrToDate: datestrtodate_sql, 1306 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1307 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1308 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1309 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1310 exp.FromTimeZone: lambda self, e: self.func( 1311 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1312 ), 1313 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1314 exp.GroupConcat: lambda self, e: groupconcat_sql( 1315 self, e, func_name="STRING_AGG", within_group=False 1316 ), 1317 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1318 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1319 exp.If: if_sql(false_value="NULL"), 1320 exp.ILike: no_ilike_sql, 1321 exp.IntDiv: rename_func("DIV"), 1322 exp.Int64: rename_func("INT64"), 1323 exp.JSONBool: rename_func("BOOL"), 1324 exp.JSONExtract: _json_extract_sql, 1325 exp.JSONExtractArray: _json_extract_sql, 1326 exp.JSONExtractScalar: _json_extract_sql, 1327 exp.JSONFormat: lambda self, e: self.func( 1328 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1329 e.this, 1330 e.args.get("options"), 1331 ), 1332 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1333 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1334 exp.Levenshtein: _levenshtein_sql, 1335 exp.Max: max_or_greatest, 1336 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1337 exp.MD5Digest: rename_func("MD5"), 1338 exp.Min: min_or_least, 1339 exp.Normalize: lambda self, e: self.func( 1340 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1341 e.this, 1342 e.args.get("form"), 1343 ), 1344 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1345 exp.RegexpExtract: lambda self, e: self.func( 1346 "REGEXP_EXTRACT", 1347 e.this, 1348 e.expression, 1349 e.args.get("position"), 1350 e.args.get("occurrence"), 1351 ), 1352 exp.RegexpExtractAll: lambda self, e: self.func( 1353 "REGEXP_EXTRACT_ALL", e.this, e.expression 1354 ), 1355 exp.RegexpReplace: regexp_replace_sql, 1356 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1357 exp.ReturnsProperty: _returnsproperty_sql, 1358 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1359 exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this), 1360 exp.ParseDatetime: lambda self, e: self.func( 1361 "PARSE_DATETIME", self.format_time(e), e.this 1362 ), 1363 exp.Select: transforms.preprocess( 1364 [ 1365 transforms.explode_projection_to_unnest(), 1366 transforms.unqualify_unnest, 1367 transforms.eliminate_distinct_on, 1368 _alias_ordered_group, 1369 transforms.eliminate_semi_and_anti_joins, 1370 ] 1371 ), 1372 exp.SHA: rename_func("SHA1"), 1373 exp.SHA2: sha256_sql, 1374 exp.StabilityProperty: lambda self, e: ( 1375 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1376 ), 1377 exp.String: rename_func("STRING"), 1378 exp.StrPosition: lambda self, e: ( 1379 strposition_sql( 1380 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1381 ) 1382 ), 1383 exp.StrToDate: _str_to_datetime_sql, 1384 exp.StrToTime: _str_to_datetime_sql, 1385 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1386 exp.TimeFromParts: rename_func("TIME"), 1387 exp.TimestampFromParts: rename_func("DATETIME"), 1388 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1389 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1390 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1391 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1392 exp.TimeStrToTime: timestrtotime_sql, 1393 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1394 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1395 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1396 exp.TsOrDsToTime: rename_func("TIME"), 1397 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1398 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1399 exp.Unhex: rename_func("FROM_HEX"), 1400 exp.UnixDate: rename_func("UNIX_DATE"), 1401 exp.UnixToTime: _unix_to_time_sql, 1402 exp.Uuid: lambda *_: "GENERATE_UUID()", 1403 exp.Values: _derived_table_values_to_unnest, 1404 exp.VariancePop: rename_func("VAR_POP"), 1405 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1406 } 1407 1408 SUPPORTED_JSON_PATH_PARTS = { 1409 exp.JSONPathKey, 1410 exp.JSONPathRoot, 1411 exp.JSONPathSubscript, 1412 } 1413 1414 TYPE_MAPPING = { 1415 **generator.Generator.TYPE_MAPPING, 1416 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1417 exp.DataType.Type.BIGINT: "INT64", 1418 exp.DataType.Type.BINARY: "BYTES", 1419 exp.DataType.Type.BLOB: "BYTES", 1420 exp.DataType.Type.BOOLEAN: "BOOL", 1421 exp.DataType.Type.CHAR: "STRING", 1422 exp.DataType.Type.DECIMAL: "NUMERIC", 1423 exp.DataType.Type.DOUBLE: "FLOAT64", 1424 exp.DataType.Type.FLOAT: "FLOAT64", 1425 exp.DataType.Type.INT: "INT64", 1426 exp.DataType.Type.NCHAR: "STRING", 1427 exp.DataType.Type.NVARCHAR: "STRING", 1428 exp.DataType.Type.SMALLINT: "INT64", 1429 exp.DataType.Type.TEXT: "STRING", 1430 exp.DataType.Type.TIMESTAMP: "DATETIME", 1431 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1432 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1433 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1434 exp.DataType.Type.TINYINT: "INT64", 1435 exp.DataType.Type.ROWVERSION: "BYTES", 1436 exp.DataType.Type.UUID: "STRING", 1437 exp.DataType.Type.VARBINARY: "BYTES", 1438 exp.DataType.Type.VARCHAR: "STRING", 1439 exp.DataType.Type.VARIANT: "ANY TYPE", 1440 } 1441 1442 PROPERTIES_LOCATION = { 1443 **generator.Generator.PROPERTIES_LOCATION, 1444 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1445 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1446 } 1447 1448 # WINDOW comes after QUALIFY 1449 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1450 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1451 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1452 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1453 } 1454 1455 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1456 RESERVED_KEYWORDS = { 1457 "all", 1458 "and", 1459 "any", 1460 "array", 1461 "as", 1462 "asc", 1463 "assert_rows_modified", 1464 "at", 1465 "between", 1466 "by", 1467 "case", 1468 "cast", 1469 "collate", 1470 "contains", 1471 "create", 1472 "cross", 1473 "cube", 1474 "current", 1475 "default", 1476 "define", 1477 "desc", 1478 "distinct", 1479 "else", 1480 "end", 1481 "enum", 1482 "escape", 1483 "except", 1484 "exclude", 1485 "exists", 1486 "extract", 1487 "false", 1488 "fetch", 1489 "following", 1490 "for", 1491 "from", 1492 "full", 1493 "group", 1494 "grouping", 1495 "groups", 1496 "hash", 1497 "having", 1498 "if", 1499 "ignore", 1500 "in", 1501 "inner", 1502 "intersect", 1503 "interval", 1504 "into", 1505 "is", 1506 "join", 1507 "lateral", 1508 "left", 1509 "like", 1510 "limit", 1511 "lookup", 1512 "merge", 1513 "natural", 1514 "new", 1515 "no", 1516 "not", 1517 "null", 1518 "nulls", 1519 "of", 1520 "on", 1521 "or", 1522 "order", 1523 "outer", 1524 "over", 1525 "partition", 1526 "preceding", 1527 "proto", 1528 "qualify", 1529 "range", 1530 "recursive", 1531 "respect", 1532 "right", 1533 "rollup", 1534 "rows", 1535 "select", 1536 "set", 1537 "some", 1538 "struct", 1539 "tablesample", 1540 "then", 1541 "to", 1542 "treat", 1543 "true", 1544 "unbounded", 1545 "union", 1546 "unnest", 1547 "using", 1548 "when", 1549 "where", 1550 "window", 1551 "with", 1552 "within", 1553 } 1554 1555 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1556 unit = expression.unit 1557 unit_sql = unit.name if unit.is_string else self.sql(unit) 1558 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1559 1560 def mod_sql(self, expression: exp.Mod) -> str: 1561 this = expression.this 1562 expr = expression.expression 1563 return self.func( 1564 "MOD", 1565 this.unnest() if isinstance(this, exp.Paren) else this, 1566 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1567 ) 1568 1569 def column_parts(self, expression: exp.Column) -> str: 1570 if expression.meta.get("quoted_column"): 1571 # If a column reference is of the form `dataset.table`.name, we need 1572 # to preserve the quoted table path, otherwise the reference breaks 1573 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1574 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1575 return f"{table_path}.{self.sql(expression, 'this')}" 1576 1577 return super().column_parts(expression) 1578 1579 def table_parts(self, expression: exp.Table) -> str: 1580 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1581 # we need to make sure the correct quoting is used in each case. 1582 # 1583 # For example, if there is a CTE x that clashes with a schema name, then the former will 1584 # return the table y in that schema, whereas the latter will return the CTE's y column: 1585 # 1586 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1587 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1588 if expression.meta.get("quoted_table"): 1589 table_parts = ".".join(p.name for p in expression.parts) 1590 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1591 1592 return super().table_parts(expression) 1593 1594 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1595 this = expression.this 1596 if isinstance(this, exp.TsOrDsToDatetime): 1597 func_name = "FORMAT_DATETIME" 1598 elif isinstance(this, exp.TsOrDsToTimestamp): 1599 func_name = "FORMAT_TIMESTAMP" 1600 elif isinstance(this, exp.TsOrDsToTime): 1601 func_name = "FORMAT_TIME" 1602 else: 1603 func_name = "FORMAT_DATE" 1604 1605 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1606 return self.func( 1607 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1608 ) 1609 1610 def eq_sql(self, expression: exp.EQ) -> str: 1611 # Operands of = cannot be NULL in BigQuery 1612 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1613 if not isinstance(expression.parent, exp.Update): 1614 return "NULL" 1615 1616 return self.binary(expression, "=") 1617 1618 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1619 parent = expression.parent 1620 1621 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1622 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1623 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1624 return self.func( 1625 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1626 ) 1627 1628 return super().attimezone_sql(expression) 1629 1630 def trycast_sql(self, expression: exp.TryCast) -> str: 1631 return self.cast_sql(expression, safe_prefix="SAFE_") 1632 1633 def bracket_sql(self, expression: exp.Bracket) -> str: 1634 this = expression.this 1635 expressions = expression.expressions 1636 1637 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1638 arg = expressions[0] 1639 if arg.type is None: 1640 from sqlglot.optimizer.annotate_types import annotate_types 1641 1642 arg = annotate_types(arg, dialect=self.dialect) 1643 1644 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1645 # BQ doesn't support bracket syntax with string values for structs 1646 return f"{self.sql(this)}.{arg.name}" 1647 1648 expressions_sql = self.expressions(expression, flat=True) 1649 offset = expression.args.get("offset") 1650 1651 if offset == 0: 1652 expressions_sql = f"OFFSET({expressions_sql})" 1653 elif offset == 1: 1654 expressions_sql = f"ORDINAL({expressions_sql})" 1655 elif offset is not None: 1656 self.unsupported(f"Unsupported array offset: {offset}") 1657 1658 if expression.args.get("safe"): 1659 expressions_sql = f"SAFE_{expressions_sql}" 1660 1661 return f"{self.sql(this)}[{expressions_sql}]" 1662 1663 def in_unnest_op(self, expression: exp.Unnest) -> str: 1664 return self.sql(expression) 1665 1666 def version_sql(self, expression: exp.Version) -> str: 1667 if expression.name == "TIMESTAMP": 1668 expression.set("this", "SYSTEM_TIME") 1669 return super().version_sql(expression) 1670 1671 def contains_sql(self, expression: exp.Contains) -> str: 1672 this = expression.this 1673 expr = expression.expression 1674 1675 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1676 this = this.this 1677 expr = expr.this 1678 1679 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1680 1681 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1682 this = expression.this 1683 1684 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1685 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1686 # because they aren't literals and so the above syntax is invalid BigQuery. 1687 if isinstance(this, exp.Array): 1688 elem = seq_get(this.expressions, 0) 1689 if not (elem and elem.find(exp.Query)): 1690 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1691 1692 return super().cast_sql(expression, safe_prefix=safe_prefix) 1693 1694 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1695 variables = self.expressions(expression, "this") 1696 default = self.sql(expression, "default") 1697 default = f" DEFAULT {default}" if default else "" 1698 kind = self.sql(expression, "kind") 1699 kind = f" {kind}" if kind else "" 1700 1701 return f"{variables}{kind}{default}"
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG)
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS ( SELECT 1 AS id, 2 AS my_id ) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id, HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except: - BigQuery, which will forward the alias to GROUP BY + HAVING clauses i.e it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1" - Clickhouse, which will forward the alias across the query i.e it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
Whether the name of the function should be preserved inside the node's metadata, can be useful for roundtripping deprecated vs new functions that share an AST node e.g JSON_VALUE vs JSON_EXTRACT_SCALAR in BigQuery
Whether hex strings such as x'CC' evaluate to integer or binary/blob type
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy').
If empty, the corresponding trie will be constructed off of TIME_MAPPING.
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL
must be explicitly specified.
672 def normalize_identifier(self, expression: E) -> E: 673 if ( 674 isinstance(expression, exp.Identifier) 675 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 676 ): 677 parent = expression.parent 678 while isinstance(parent, exp.Dot): 679 parent = parent.parent 680 681 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 682 # by default. The following check uses a heuristic to detect tables based on whether 683 # they are qualified. This should generally be correct, because tables in BigQuery 684 # must be qualified with at least a dataset, unless @@dataset_id is set. 685 case_sensitive = ( 686 isinstance(parent, exp.UserDefinedFunction) 687 or ( 688 isinstance(parent, exp.Table) 689 and parent.db 690 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 691 ) 692 or expression.meta.get("is_table") 693 ) 694 if not case_sensitive: 695 expression.set("this", expression.this.lower()) 696 697 return t.cast(E, expression) 698 699 return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it
lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so
it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive,
and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system, for example they may always be case-sensitive in Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
Mapping of an escaped sequence (\n) to its unescaped version (
).
701 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 702 VAR_TOKENS = { 703 TokenType.DASH, 704 TokenType.VAR, 705 }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- BYTE_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- IDENTIFIERS
- QUOTES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- COMMENTS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
707 class Tokenizer(tokens.Tokenizer): 708 QUOTES = ["'", '"', '"""', "'''"] 709 COMMENTS = ["--", "#", ("/*", "*/")] 710 IDENTIFIERS = ["`"] 711 STRING_ESCAPES = ["\\"] 712 713 HEX_STRINGS = [("0x", ""), ("0X", "")] 714 715 BYTE_STRINGS = [ 716 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 717 ] 718 719 RAW_STRINGS = [ 720 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 721 ] 722 723 NESTED_COMMENTS = False 724 725 KEYWORDS = { 726 **tokens.Tokenizer.KEYWORDS, 727 "ANY TYPE": TokenType.VARIANT, 728 "BEGIN": TokenType.COMMAND, 729 "BEGIN TRANSACTION": TokenType.BEGIN, 730 "BYTEINT": TokenType.INT, 731 "BYTES": TokenType.BINARY, 732 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 733 "DATETIME": TokenType.TIMESTAMP, 734 "DECLARE": TokenType.DECLARE, 735 "ELSEIF": TokenType.COMMAND, 736 "EXCEPTION": TokenType.COMMAND, 737 "EXPORT": TokenType.EXPORT, 738 "FLOAT64": TokenType.DOUBLE, 739 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 740 "LOOP": TokenType.COMMAND, 741 "MODEL": TokenType.MODEL, 742 "NOT DETERMINISTIC": TokenType.VOLATILE, 743 "RECORD": TokenType.STRUCT, 744 "REPEAT": TokenType.COMMAND, 745 "TIMESTAMP": TokenType.TIMESTAMPTZ, 746 "WHILE": TokenType.COMMAND, 747 } 748 KEYWORDS.pop("DIV") 749 KEYWORDS.pop("VALUES") 750 KEYWORDS.pop("/*+")
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
752 class Parser(parser.Parser): 753 PREFIXED_PIVOT_COLUMNS = True 754 LOG_DEFAULTS_TO_LN = True 755 SUPPORTS_IMPLICIT_UNNEST = True 756 JOINS_HAVE_EQUAL_PRECEDENCE = True 757 758 # BigQuery does not allow ASC/DESC to be used as an identifier 759 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 760 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 761 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 762 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 763 TokenType.ASC, 764 TokenType.DESC, 765 } 766 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 767 768 FUNCTIONS = { 769 **parser.Parser.FUNCTIONS, 770 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 771 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 772 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 773 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 774 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 775 "BOOL": exp.JSONBool.from_arg_list, 776 "CONTAINS_SUBSTR": _build_contains_substring, 777 "DATE": _build_date, 778 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 779 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 780 "DATE_TRUNC": lambda args: exp.DateTrunc( 781 unit=seq_get(args, 1), 782 this=seq_get(args, 0), 783 zone=seq_get(args, 2), 784 ), 785 "DATETIME": _build_datetime, 786 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 787 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 788 "DIV": binary_from_function(exp.IntDiv), 789 "EDIT_DISTANCE": _build_levenshtein, 790 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 791 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 792 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 793 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 794 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 795 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 796 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 797 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 798 "JSON_STRIP_NULLS": _build_json_strip_nulls, 799 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 800 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 801 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 802 "MD5": exp.MD5Digest.from_arg_list, 803 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 804 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 805 ), 806 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 807 "TO_HEX": _build_to_hex, 808 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 809 [seq_get(args, 1), seq_get(args, 0)] 810 ), 811 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 812 [seq_get(args, 1), seq_get(args, 0)] 813 ), 814 "PARSE_TIMESTAMP": _build_parse_timestamp, 815 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 816 [seq_get(args, 1), seq_get(args, 0)] 817 ), 818 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 819 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 820 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 821 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 822 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 823 ), 824 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 825 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 826 "SPLIT": lambda args: exp.Split( 827 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 828 this=seq_get(args, 0), 829 expression=seq_get(args, 1) or exp.Literal.string(","), 830 ), 831 "STRPOS": exp.StrPosition.from_arg_list, 832 "TIME": _build_time, 833 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 834 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 835 "TIMESTAMP": _build_timestamp, 836 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 837 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 838 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 839 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 840 ), 841 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 842 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 843 ), 844 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 845 "TO_JSON": lambda args: exp.JSONFormat( 846 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 847 ), 848 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 849 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 850 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 851 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 852 "FROM_HEX": exp.Unhex.from_arg_list, 853 "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 854 } 855 856 FUNCTION_PARSERS = { 857 **parser.Parser.FUNCTION_PARSERS, 858 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 859 "JSON_ARRAY": lambda self: self.expression( 860 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 861 ), 862 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 863 "PREDICT": lambda self: self._parse_ml(exp.Predict), 864 "TRANSLATE": lambda self: self._parse_translate(), 865 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 866 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 867 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 868 exp.GenerateEmbedding, is_text=True 869 ), 870 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 871 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 872 } 873 FUNCTION_PARSERS.pop("TRIM") 874 875 NO_PAREN_FUNCTIONS = { 876 **parser.Parser.NO_PAREN_FUNCTIONS, 877 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 878 } 879 880 NESTED_TYPE_TOKENS = { 881 *parser.Parser.NESTED_TYPE_TOKENS, 882 TokenType.TABLE, 883 } 884 885 PROPERTY_PARSERS = { 886 **parser.Parser.PROPERTY_PARSERS, 887 "NOT DETERMINISTIC": lambda self: self.expression( 888 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 889 ), 890 "OPTIONS": lambda self: self._parse_with_property(), 891 } 892 893 CONSTRAINT_PARSERS = { 894 **parser.Parser.CONSTRAINT_PARSERS, 895 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 896 } 897 898 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 899 RANGE_PARSERS.pop(TokenType.OVERLAPS) 900 901 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 902 903 STATEMENT_PARSERS = { 904 **parser.Parser.STATEMENT_PARSERS, 905 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 906 TokenType.END: lambda self: self._parse_as_command(self._prev), 907 TokenType.FOR: lambda self: self._parse_for_in(), 908 TokenType.EXPORT: lambda self: self._parse_export_data(), 909 TokenType.DECLARE: lambda self: self._parse_declare(), 910 } 911 912 BRACKET_OFFSETS = { 913 "OFFSET": (0, False), 914 "ORDINAL": (1, False), 915 "SAFE_OFFSET": (0, True), 916 "SAFE_ORDINAL": (1, True), 917 } 918 919 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 920 index = self._index 921 this = self._parse_range() 922 self._match_text_seq("DO") 923 if self._match(TokenType.COMMAND): 924 self._retreat(index) 925 return self._parse_as_command(self._prev) 926 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 927 928 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 929 this = super()._parse_table_part(schema=schema) or self._parse_number() 930 931 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 932 if isinstance(this, exp.Identifier): 933 table_name = this.name 934 while self._match(TokenType.DASH, advance=False) and self._next: 935 start = self._curr 936 while self._is_connected() and not self._match_set( 937 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 938 ): 939 self._advance() 940 941 if start == self._curr: 942 break 943 944 table_name += self._find_sql(start, self._prev) 945 946 this = exp.Identifier( 947 this=table_name, quoted=this.args.get("quoted") 948 ).update_positions(this) 949 elif isinstance(this, exp.Literal): 950 table_name = this.name 951 952 if self._is_connected() and self._parse_var(any_token=True): 953 table_name += self._prev.text 954 955 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 956 957 return this 958 959 def _parse_table_parts( 960 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 961 ) -> exp.Table: 962 table = super()._parse_table_parts( 963 schema=schema, is_db_reference=is_db_reference, wildcard=True 964 ) 965 966 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 967 if not table.catalog: 968 if table.db: 969 previous_db = table.args["db"] 970 parts = table.db.split(".") 971 if len(parts) == 2 and not table.args["db"].quoted: 972 table.set( 973 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 974 ) 975 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 976 else: 977 previous_this = table.this 978 parts = table.name.split(".") 979 if len(parts) == 2 and not table.this.quoted: 980 table.set( 981 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 982 ) 983 table.set( 984 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 985 ) 986 987 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 988 alias = table.this 989 catalog, db, this, *rest = ( 990 exp.to_identifier(p, quoted=True) 991 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 992 ) 993 994 for part in (catalog, db, this): 995 if part: 996 part.update_positions(table.this) 997 998 if rest and this: 999 this = exp.Dot.build([this, *rest]) # type: ignore 1000 1001 table = exp.Table( 1002 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1003 ) 1004 table.meta["quoted_table"] = True 1005 else: 1006 alias = None 1007 1008 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1009 # dataset, so if the project identifier is omitted we need to fix the ast so that 1010 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1011 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1012 # views, because it would seem like the "catalog" part is set, when it'd actually 1013 # be the region/dataset. Merging the two identifiers into a single one is done to 1014 # avoid producing a 4-part Table reference, which would cause issues in the schema 1015 # module, when there are 3-part table names mixed with information schema views. 1016 # 1017 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1018 table_parts = table.parts 1019 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1020 # We need to alias the table here to avoid breaking existing qualified columns. 1021 # This is expected to be safe, because if there's an actual alias coming up in 1022 # the token stream, it will overwrite this one. If there isn't one, we are only 1023 # exposing the name that can be used to reference the view explicitly (a no-op). 1024 exp.alias_( 1025 table, 1026 t.cast(exp.Identifier, alias or table_parts[-1]), 1027 table=True, 1028 copy=False, 1029 ) 1030 1031 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1032 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1033 line=table_parts[-2].meta.get("line"), 1034 col=table_parts[-1].meta.get("col"), 1035 start=table_parts[-2].meta.get("start"), 1036 end=table_parts[-1].meta.get("end"), 1037 ) 1038 table.set("this", new_this) 1039 table.set("db", seq_get(table_parts, -3)) 1040 table.set("catalog", seq_get(table_parts, -4)) 1041 1042 return table 1043 1044 def _parse_column(self) -> t.Optional[exp.Expression]: 1045 column = super()._parse_column() 1046 if isinstance(column, exp.Column): 1047 parts = column.parts 1048 if any("." in p.name for p in parts): 1049 catalog, db, table, this, *rest = ( 1050 exp.to_identifier(p, quoted=True) 1051 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1052 ) 1053 1054 if rest and this: 1055 this = exp.Dot.build([this, *rest]) # type: ignore 1056 1057 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1058 column.meta["quoted_column"] = True 1059 1060 return column 1061 1062 @t.overload 1063 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1064 1065 @t.overload 1066 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 1067 1068 def _parse_json_object(self, agg=False): 1069 json_object = super()._parse_json_object() 1070 array_kv_pair = seq_get(json_object.expressions, 0) 1071 1072 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1073 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1074 if ( 1075 array_kv_pair 1076 and isinstance(array_kv_pair.this, exp.Array) 1077 and isinstance(array_kv_pair.expression, exp.Array) 1078 ): 1079 keys = array_kv_pair.this.expressions 1080 values = array_kv_pair.expression.expressions 1081 1082 json_object.set( 1083 "expressions", 1084 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1085 ) 1086 1087 return json_object 1088 1089 def _parse_bracket( 1090 self, this: t.Optional[exp.Expression] = None 1091 ) -> t.Optional[exp.Expression]: 1092 bracket = super()._parse_bracket(this) 1093 1094 if this is bracket: 1095 return bracket 1096 1097 if isinstance(bracket, exp.Bracket): 1098 for expression in bracket.expressions: 1099 name = expression.name.upper() 1100 1101 if name not in self.BRACKET_OFFSETS: 1102 break 1103 1104 offset, safe = self.BRACKET_OFFSETS[name] 1105 bracket.set("offset", offset) 1106 bracket.set("safe", safe) 1107 expression.replace(expression.expressions[0]) 1108 1109 return bracket 1110 1111 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1112 unnest = super()._parse_unnest(with_alias=with_alias) 1113 1114 if not unnest: 1115 return None 1116 1117 unnest_expr = seq_get(unnest.expressions, 0) 1118 if unnest_expr: 1119 from sqlglot.optimizer.annotate_types import annotate_types 1120 1121 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1122 1123 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1124 # in contrast to other dialects such as DuckDB which flattens only the array by default 1125 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1126 array_elem.is_type(exp.DataType.Type.STRUCT) 1127 for array_elem in unnest_expr._type.expressions 1128 ): 1129 unnest.set("explode_array", True) 1130 1131 return unnest 1132 1133 def _parse_make_interval(self) -> exp.MakeInterval: 1134 expr = exp.MakeInterval() 1135 1136 for arg_key in expr.arg_types: 1137 value = self._parse_lambda() 1138 1139 if not value: 1140 break 1141 1142 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1143 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1144 if isinstance(value, exp.Kwarg): 1145 arg_key = value.this.name 1146 1147 expr.set(arg_key, value) 1148 1149 self._match(TokenType.COMMA) 1150 1151 return expr 1152 1153 def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E: 1154 self._match_text_seq("MODEL") 1155 this = self._parse_table() 1156 1157 self._match(TokenType.COMMA) 1158 self._match_text_seq("TABLE") 1159 1160 # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one 1161 expression = ( 1162 self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None 1163 ) 1164 1165 self._match(TokenType.COMMA) 1166 1167 return self.expression( 1168 expr_type, 1169 this=this, 1170 expression=expression, 1171 params_struct=self._parse_bitwise(), 1172 **kwargs, 1173 ) 1174 1175 def _parse_translate(self) -> exp.Translate | exp.MLTranslate: 1176 # Check if this is ML.TRANSLATE by looking at previous tokens 1177 token = seq_get(self._tokens, self._index - 4) 1178 if token and token.text.upper() == "ML": 1179 return self._parse_ml(exp.MLTranslate) 1180 1181 return exp.Translate.from_arg_list(self._parse_function_args()) 1182 1183 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1184 self._match(TokenType.TABLE) 1185 this = self._parse_table() 1186 1187 expr = self.expression(exp.FeaturesAtTime, this=this) 1188 1189 while self._match(TokenType.COMMA): 1190 arg = self._parse_lambda() 1191 1192 # Get the LHS of the Kwarg and set the arg to that value, e.g 1193 # "num_rows => 1" sets the expr's `num_rows` arg 1194 if arg: 1195 expr.set(arg.this.name, arg) 1196 1197 return expr 1198 1199 def _parse_vector_search(self) -> exp.VectorSearch: 1200 self._match(TokenType.TABLE) 1201 base_table = self._parse_table() 1202 1203 self._match(TokenType.COMMA) 1204 1205 column_to_search = self._parse_bitwise() 1206 self._match(TokenType.COMMA) 1207 1208 self._match(TokenType.TABLE) 1209 query_table = self._parse_table() 1210 1211 expr = self.expression( 1212 exp.VectorSearch, 1213 this=base_table, 1214 column_to_search=column_to_search, 1215 query_table=query_table, 1216 ) 1217 1218 while self._match(TokenType.COMMA): 1219 # query_column_to_search can be named argument or positional 1220 if self._match(TokenType.STRING, advance=False): 1221 query_column = self._parse_string() 1222 expr.set("query_column_to_search", query_column) 1223 else: 1224 arg = self._parse_lambda() 1225 if arg: 1226 expr.set(arg.this.name, arg) 1227 1228 return expr 1229 1230 def _parse_export_data(self) -> exp.Export: 1231 self._match_text_seq("DATA") 1232 1233 return self.expression( 1234 exp.Export, 1235 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1236 options=self._parse_properties(), 1237 this=self._match_text_seq("AS") and self._parse_select(), 1238 )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- QUERY_MODIFIER_TOKENS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- ADD_JOIN_ON_TRUE
- SUPPORTS_OMITTED_INTERVAL_SPAN_UNIT
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- build_cast
- errors
- sql
1240 class Generator(generator.Generator): 1241 INTERVAL_ALLOWS_PLURAL_FORM = False 1242 JOIN_HINTS = False 1243 QUERY_HINTS = False 1244 TABLE_HINTS = False 1245 LIMIT_FETCH = "LIMIT" 1246 RENAME_TABLE_WITH_DB = False 1247 NVL2_SUPPORTED = False 1248 UNNEST_WITH_ORDINALITY = False 1249 COLLATE_IS_FUNC = True 1250 LIMIT_ONLY_LITERALS = True 1251 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1252 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1253 JSON_KEY_VALUE_PAIR_SEP = "," 1254 NULL_ORDERING_SUPPORTED = False 1255 IGNORE_NULLS_IN_FUNC = True 1256 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1257 CAN_IMPLEMENT_ARRAY_ANY = True 1258 SUPPORTS_TO_NUMBER = False 1259 NAMED_PLACEHOLDER_TOKEN = "@" 1260 HEX_FUNC = "TO_HEX" 1261 WITH_PROPERTIES_PREFIX = "OPTIONS" 1262 SUPPORTS_EXPLODING_PROJECTIONS = False 1263 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1264 SUPPORTS_UNIX_SECONDS = True 1265 1266 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1267 1268 TS_OR_DS_TYPES = ( 1269 exp.TsOrDsToDatetime, 1270 exp.TsOrDsToTimestamp, 1271 exp.TsOrDsToTime, 1272 exp.TsOrDsToDate, 1273 ) 1274 1275 TRANSFORMS = { 1276 **generator.Generator.TRANSFORMS, 1277 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1278 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1279 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 1280 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1281 exp.Array: inline_array_unless_query, 1282 exp.ArrayContains: _array_contains_sql, 1283 exp.ArrayFilter: filter_array_using_unnest, 1284 exp.ArrayRemove: filter_array_using_unnest, 1285 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1286 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1287 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1288 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1289 exp.ByteLength: rename_func("BYTE_LENGTH"), 1290 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1291 exp.CollateProperty: lambda self, e: ( 1292 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1293 if e.args.get("default") 1294 else f"COLLATE {self.sql(e, 'this')}" 1295 ), 1296 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1297 exp.CountIf: rename_func("COUNTIF"), 1298 exp.Create: _create_sql, 1299 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1300 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1301 exp.DateDiff: lambda self, e: self.func( 1302 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1303 ), 1304 exp.DateFromParts: rename_func("DATE"), 1305 exp.DateStrToDate: datestrtodate_sql, 1306 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1307 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1308 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1309 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1310 exp.FromTimeZone: lambda self, e: self.func( 1311 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1312 ), 1313 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1314 exp.GroupConcat: lambda self, e: groupconcat_sql( 1315 self, e, func_name="STRING_AGG", within_group=False 1316 ), 1317 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1318 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1319 exp.If: if_sql(false_value="NULL"), 1320 exp.ILike: no_ilike_sql, 1321 exp.IntDiv: rename_func("DIV"), 1322 exp.Int64: rename_func("INT64"), 1323 exp.JSONBool: rename_func("BOOL"), 1324 exp.JSONExtract: _json_extract_sql, 1325 exp.JSONExtractArray: _json_extract_sql, 1326 exp.JSONExtractScalar: _json_extract_sql, 1327 exp.JSONFormat: lambda self, e: self.func( 1328 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1329 e.this, 1330 e.args.get("options"), 1331 ), 1332 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1333 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1334 exp.Levenshtein: _levenshtein_sql, 1335 exp.Max: max_or_greatest, 1336 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1337 exp.MD5Digest: rename_func("MD5"), 1338 exp.Min: min_or_least, 1339 exp.Normalize: lambda self, e: self.func( 1340 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1341 e.this, 1342 e.args.get("form"), 1343 ), 1344 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1345 exp.RegexpExtract: lambda self, e: self.func( 1346 "REGEXP_EXTRACT", 1347 e.this, 1348 e.expression, 1349 e.args.get("position"), 1350 e.args.get("occurrence"), 1351 ), 1352 exp.RegexpExtractAll: lambda self, e: self.func( 1353 "REGEXP_EXTRACT_ALL", e.this, e.expression 1354 ), 1355 exp.RegexpReplace: regexp_replace_sql, 1356 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1357 exp.ReturnsProperty: _returnsproperty_sql, 1358 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1359 exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this), 1360 exp.ParseDatetime: lambda self, e: self.func( 1361 "PARSE_DATETIME", self.format_time(e), e.this 1362 ), 1363 exp.Select: transforms.preprocess( 1364 [ 1365 transforms.explode_projection_to_unnest(), 1366 transforms.unqualify_unnest, 1367 transforms.eliminate_distinct_on, 1368 _alias_ordered_group, 1369 transforms.eliminate_semi_and_anti_joins, 1370 ] 1371 ), 1372 exp.SHA: rename_func("SHA1"), 1373 exp.SHA2: sha256_sql, 1374 exp.StabilityProperty: lambda self, e: ( 1375 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1376 ), 1377 exp.String: rename_func("STRING"), 1378 exp.StrPosition: lambda self, e: ( 1379 strposition_sql( 1380 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1381 ) 1382 ), 1383 exp.StrToDate: _str_to_datetime_sql, 1384 exp.StrToTime: _str_to_datetime_sql, 1385 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1386 exp.TimeFromParts: rename_func("TIME"), 1387 exp.TimestampFromParts: rename_func("DATETIME"), 1388 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1389 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1390 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1391 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1392 exp.TimeStrToTime: timestrtotime_sql, 1393 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1394 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1395 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1396 exp.TsOrDsToTime: rename_func("TIME"), 1397 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1398 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1399 exp.Unhex: rename_func("FROM_HEX"), 1400 exp.UnixDate: rename_func("UNIX_DATE"), 1401 exp.UnixToTime: _unix_to_time_sql, 1402 exp.Uuid: lambda *_: "GENERATE_UUID()", 1403 exp.Values: _derived_table_values_to_unnest, 1404 exp.VariancePop: rename_func("VAR_POP"), 1405 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1406 } 1407 1408 SUPPORTED_JSON_PATH_PARTS = { 1409 exp.JSONPathKey, 1410 exp.JSONPathRoot, 1411 exp.JSONPathSubscript, 1412 } 1413 1414 TYPE_MAPPING = { 1415 **generator.Generator.TYPE_MAPPING, 1416 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1417 exp.DataType.Type.BIGINT: "INT64", 1418 exp.DataType.Type.BINARY: "BYTES", 1419 exp.DataType.Type.BLOB: "BYTES", 1420 exp.DataType.Type.BOOLEAN: "BOOL", 1421 exp.DataType.Type.CHAR: "STRING", 1422 exp.DataType.Type.DECIMAL: "NUMERIC", 1423 exp.DataType.Type.DOUBLE: "FLOAT64", 1424 exp.DataType.Type.FLOAT: "FLOAT64", 1425 exp.DataType.Type.INT: "INT64", 1426 exp.DataType.Type.NCHAR: "STRING", 1427 exp.DataType.Type.NVARCHAR: "STRING", 1428 exp.DataType.Type.SMALLINT: "INT64", 1429 exp.DataType.Type.TEXT: "STRING", 1430 exp.DataType.Type.TIMESTAMP: "DATETIME", 1431 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1432 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1433 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1434 exp.DataType.Type.TINYINT: "INT64", 1435 exp.DataType.Type.ROWVERSION: "BYTES", 1436 exp.DataType.Type.UUID: "STRING", 1437 exp.DataType.Type.VARBINARY: "BYTES", 1438 exp.DataType.Type.VARCHAR: "STRING", 1439 exp.DataType.Type.VARIANT: "ANY TYPE", 1440 } 1441 1442 PROPERTIES_LOCATION = { 1443 **generator.Generator.PROPERTIES_LOCATION, 1444 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1445 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1446 } 1447 1448 # WINDOW comes after QUALIFY 1449 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1450 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1451 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1452 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1453 } 1454 1455 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1456 RESERVED_KEYWORDS = { 1457 "all", 1458 "and", 1459 "any", 1460 "array", 1461 "as", 1462 "asc", 1463 "assert_rows_modified", 1464 "at", 1465 "between", 1466 "by", 1467 "case", 1468 "cast", 1469 "collate", 1470 "contains", 1471 "create", 1472 "cross", 1473 "cube", 1474 "current", 1475 "default", 1476 "define", 1477 "desc", 1478 "distinct", 1479 "else", 1480 "end", 1481 "enum", 1482 "escape", 1483 "except", 1484 "exclude", 1485 "exists", 1486 "extract", 1487 "false", 1488 "fetch", 1489 "following", 1490 "for", 1491 "from", 1492 "full", 1493 "group", 1494 "grouping", 1495 "groups", 1496 "hash", 1497 "having", 1498 "if", 1499 "ignore", 1500 "in", 1501 "inner", 1502 "intersect", 1503 "interval", 1504 "into", 1505 "is", 1506 "join", 1507 "lateral", 1508 "left", 1509 "like", 1510 "limit", 1511 "lookup", 1512 "merge", 1513 "natural", 1514 "new", 1515 "no", 1516 "not", 1517 "null", 1518 "nulls", 1519 "of", 1520 "on", 1521 "or", 1522 "order", 1523 "outer", 1524 "over", 1525 "partition", 1526 "preceding", 1527 "proto", 1528 "qualify", 1529 "range", 1530 "recursive", 1531 "respect", 1532 "right", 1533 "rollup", 1534 "rows", 1535 "select", 1536 "set", 1537 "some", 1538 "struct", 1539 "tablesample", 1540 "then", 1541 "to", 1542 "treat", 1543 "true", 1544 "unbounded", 1545 "union", 1546 "unnest", 1547 "using", 1548 "when", 1549 "where", 1550 "window", 1551 "with", 1552 "within", 1553 } 1554 1555 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1556 unit = expression.unit 1557 unit_sql = unit.name if unit.is_string else self.sql(unit) 1558 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1559 1560 def mod_sql(self, expression: exp.Mod) -> str: 1561 this = expression.this 1562 expr = expression.expression 1563 return self.func( 1564 "MOD", 1565 this.unnest() if isinstance(this, exp.Paren) else this, 1566 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1567 ) 1568 1569 def column_parts(self, expression: exp.Column) -> str: 1570 if expression.meta.get("quoted_column"): 1571 # If a column reference is of the form `dataset.table`.name, we need 1572 # to preserve the quoted table path, otherwise the reference breaks 1573 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1574 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1575 return f"{table_path}.{self.sql(expression, 'this')}" 1576 1577 return super().column_parts(expression) 1578 1579 def table_parts(self, expression: exp.Table) -> str: 1580 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1581 # we need to make sure the correct quoting is used in each case. 1582 # 1583 # For example, if there is a CTE x that clashes with a schema name, then the former will 1584 # return the table y in that schema, whereas the latter will return the CTE's y column: 1585 # 1586 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1587 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1588 if expression.meta.get("quoted_table"): 1589 table_parts = ".".join(p.name for p in expression.parts) 1590 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1591 1592 return super().table_parts(expression) 1593 1594 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1595 this = expression.this 1596 if isinstance(this, exp.TsOrDsToDatetime): 1597 func_name = "FORMAT_DATETIME" 1598 elif isinstance(this, exp.TsOrDsToTimestamp): 1599 func_name = "FORMAT_TIMESTAMP" 1600 elif isinstance(this, exp.TsOrDsToTime): 1601 func_name = "FORMAT_TIME" 1602 else: 1603 func_name = "FORMAT_DATE" 1604 1605 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1606 return self.func( 1607 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1608 ) 1609 1610 def eq_sql(self, expression: exp.EQ) -> str: 1611 # Operands of = cannot be NULL in BigQuery 1612 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1613 if not isinstance(expression.parent, exp.Update): 1614 return "NULL" 1615 1616 return self.binary(expression, "=") 1617 1618 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1619 parent = expression.parent 1620 1621 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1622 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1623 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1624 return self.func( 1625 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1626 ) 1627 1628 return super().attimezone_sql(expression) 1629 1630 def trycast_sql(self, expression: exp.TryCast) -> str: 1631 return self.cast_sql(expression, safe_prefix="SAFE_") 1632 1633 def bracket_sql(self, expression: exp.Bracket) -> str: 1634 this = expression.this 1635 expressions = expression.expressions 1636 1637 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1638 arg = expressions[0] 1639 if arg.type is None: 1640 from sqlglot.optimizer.annotate_types import annotate_types 1641 1642 arg = annotate_types(arg, dialect=self.dialect) 1643 1644 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1645 # BQ doesn't support bracket syntax with string values for structs 1646 return f"{self.sql(this)}.{arg.name}" 1647 1648 expressions_sql = self.expressions(expression, flat=True) 1649 offset = expression.args.get("offset") 1650 1651 if offset == 0: 1652 expressions_sql = f"OFFSET({expressions_sql})" 1653 elif offset == 1: 1654 expressions_sql = f"ORDINAL({expressions_sql})" 1655 elif offset is not None: 1656 self.unsupported(f"Unsupported array offset: {offset}") 1657 1658 if expression.args.get("safe"): 1659 expressions_sql = f"SAFE_{expressions_sql}" 1660 1661 return f"{self.sql(this)}[{expressions_sql}]" 1662 1663 def in_unnest_op(self, expression: exp.Unnest) -> str: 1664 return self.sql(expression) 1665 1666 def version_sql(self, expression: exp.Version) -> str: 1667 if expression.name == "TIMESTAMP": 1668 expression.set("this", "SYSTEM_TIME") 1669 return super().version_sql(expression) 1670 1671 def contains_sql(self, expression: exp.Contains) -> str: 1672 this = expression.this 1673 expr = expression.expression 1674 1675 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1676 this = this.this 1677 expr = expr.this 1678 1679 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1680 1681 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1682 this = expression.this 1683 1684 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1685 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1686 # because they aren't literals and so the above syntax is invalid BigQuery. 1687 if isinstance(this, exp.Array): 1688 elem = seq_get(this.expressions, 0) 1689 if not (elem and elem.find(exp.Query)): 1690 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1691 1692 return super().cast_sql(expression, safe_prefix=safe_prefix) 1693 1694 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1695 variables = self.expressions(expression, "this") 1696 default = self.sql(expression, "default") 1697 default = f" DEFAULT {default}" if default else "" 1698 kind = self.sql(expression, "kind") 1699 kind = f" {kind}" if kind else "" 1700 1701 return f"{variables}{kind}{default}"
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the
indentation of subqueries and filters under a
WHEREclause. Default: 2. - normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
1569 def column_parts(self, expression: exp.Column) -> str: 1570 if expression.meta.get("quoted_column"): 1571 # If a column reference is of the form `dataset.table`.name, we need 1572 # to preserve the quoted table path, otherwise the reference breaks 1573 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1574 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1575 return f"{table_path}.{self.sql(expression, 'this')}" 1576 1577 return super().column_parts(expression)
1579 def table_parts(self, expression: exp.Table) -> str: 1580 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1581 # we need to make sure the correct quoting is used in each case. 1582 # 1583 # For example, if there is a CTE x that clashes with a schema name, then the former will 1584 # return the table y in that schema, whereas the latter will return the CTE's y column: 1585 # 1586 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1587 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1588 if expression.meta.get("quoted_table"): 1589 table_parts = ".".join(p.name for p in expression.parts) 1590 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1591 1592 return super().table_parts(expression)
1594 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1595 this = expression.this 1596 if isinstance(this, exp.TsOrDsToDatetime): 1597 func_name = "FORMAT_DATETIME" 1598 elif isinstance(this, exp.TsOrDsToTimestamp): 1599 func_name = "FORMAT_TIMESTAMP" 1600 elif isinstance(this, exp.TsOrDsToTime): 1601 func_name = "FORMAT_TIME" 1602 else: 1603 func_name = "FORMAT_DATE" 1604 1605 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1606 return self.func( 1607 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1608 )
1610 def eq_sql(self, expression: exp.EQ) -> str: 1611 # Operands of = cannot be NULL in BigQuery 1612 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1613 if not isinstance(expression.parent, exp.Update): 1614 return "NULL" 1615 1616 return self.binary(expression, "=")
1618 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1619 parent = expression.parent 1620 1621 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1622 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1623 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1624 return self.func( 1625 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1626 ) 1627 1628 return super().attimezone_sql(expression)
1633 def bracket_sql(self, expression: exp.Bracket) -> str: 1634 this = expression.this 1635 expressions = expression.expressions 1636 1637 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1638 arg = expressions[0] 1639 if arg.type is None: 1640 from sqlglot.optimizer.annotate_types import annotate_types 1641 1642 arg = annotate_types(arg, dialect=self.dialect) 1643 1644 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1645 # BQ doesn't support bracket syntax with string values for structs 1646 return f"{self.sql(this)}.{arg.name}" 1647 1648 expressions_sql = self.expressions(expression, flat=True) 1649 offset = expression.args.get("offset") 1650 1651 if offset == 0: 1652 expressions_sql = f"OFFSET({expressions_sql})" 1653 elif offset == 1: 1654 expressions_sql = f"ORDINAL({expressions_sql})" 1655 elif offset is not None: 1656 self.unsupported(f"Unsupported array offset: {offset}") 1657 1658 if expression.args.get("safe"): 1659 expressions_sql = f"SAFE_{expressions_sql}" 1660 1661 return f"{self.sql(this)}[{expressions_sql}]"
1671 def contains_sql(self, expression: exp.Contains) -> str: 1672 this = expression.this 1673 expr = expression.expression 1674 1675 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1676 this = this.this 1677 expr = expr.this 1678 1679 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope"))
1681 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1682 this = expression.this 1683 1684 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1685 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1686 # because they aren't literals and so the above syntax is invalid BigQuery. 1687 if isinstance(this, exp.Array): 1688 elem = seq_get(this.expressions, 0) 1689 if not (elem and elem.find(exp.Query)): 1690 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1691 1692 return super().cast_sql(expression, safe_prefix=safe_prefix)
1694 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1695 variables = self.expressions(expression, "this") 1696 default = self.sql(expression, "default") 1697 default = f" DEFAULT {default}" if default else "" 1698 kind = self.sql(expression, "kind") 1699 kind = f" {kind}" if kind else "" 1700 1701 return f"{variables}{kind}{default}"
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- UNICODE_SUBSTITUTE
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- SUPPORTS_BETWEEN_FLAGS
- SUPPORTS_LIKE_QUANTIFIERS
- MATCH_AGAINST_TABLE_PREFIX
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- is_sql
- like_sql
- ilike_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- mltranslate_sql
- mlforecast_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- install_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- space_sql
- buildproperty_sql
- refreshtriggerproperty_sql
- modelattribute_sql