# Enable YAML schema in IDE
# yaml-language-server: $schema=http://napkin-public-storage-bucket.s3-website-us-east-1.amazonaws.com/schema/schema.json
# We don't allow extra fields except for top-level fields that start with _.
# They can be useful for code reuse via YAML anchors (see the illustration below).
_commonOptions: &common
some_common: options
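# For illustration only: the anchor above could be reused elsewhere via a
# standard YAML alias, e.g. in another _-prefixed top-level field:
# _otherOptions: *common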
## Application name
app_name: foo bar
## Metadata configuration
# Napkin maintains a database with the history of runs. It's used to estimate pipeline runtime as well as to enforce update strategies.
metadata_url: sqlite:metadata.db # optional; by default metadata is stored in an SQLite file in the $PWD/data directory.
# metadata_url: postgresql://db@host/napkin_metadata # metadata can also be stored in Postgres
#### Database connection configuration
## BigQuery
backend: BigQuery
db_url: bigquery://bigquery.googleapis.com/project-acme?dataset=acme-analytics
backend_options: # backend specific options
labels: # can be used to segregate costs in the Google Cloud billing dashboard
client: acme
repo: client-acme
concurrent_queries: 100
timeout: 300
## Postgres
# backend: Postgres
# db_url: postgresql://user@db/napkin_db
# backend_options: # backend specific options
# connection_pool: 100
## Sqlite
# backend: Sqlite
# db_url: sqlite:acme.db
# backend_options: {} # no backend options for Sqlite
## Location of SQL files when referenced in Spec.
sql_folder: ../sql # optional, this is the default value
## SQL dialect to use when parsing queries
parser_dialect: napkin.bigquery
# The napkin.* family of dialects parses SQL and then converts it to the Napkin intermediate representation.
# This enables Napkin optimization features; however, some SQL features are not available.
# Queries parsed this way are rendered according to the db backend syntax; this can be used to convert queries (e.g. parse as BigQuery, run with Postgres; see the commented example after the dialect list).
# parser_dialect: napkin.bigquery
# parser_dialect: napkin.postgres
# parser_dialect: napkin.sqlite
# parser_dialect: napkin.ansi2011
# The generic.* family of dialects parses SQL, but does not convert it into the Napkin intermediate representation.
# This provides more compatibility with SQL features.
# Queries parsed this way are rendered according to the db backend dialect; this can also be used to convert queries (e.g. parse as BigQuery, run with Postgres).
# parser_dialect: generic.bigquery
# parser_dialect: generic.postgres
# parser_dialect: generic.sqlite
# parser_dialect: generic.ansi2011
# Use a parser based on the Postgres SQL grammar.
# Postgres-specific features (such as the ->> JSON operator) can be used in this mode.
# Queries parsed this way are always rendered according to the Postgres dialect.
# parser_dialect: postgres
# Treat queries as plain text. Use this only if the other dialects fail.
# WARNING: neither dependency discovery nor table renaming works with raw queries
# parser_dialect: raw
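# For example, a hypothetical conversion setup that parses queries written in
# BigQuery SQL while running against a Postgres backend could look like:
# backend: Postgres
# db_url: postgresql://user@db/napkin_db
# parser_dialect: napkin.bigquery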
## Location of Haskell files when referenced in Spec.
haskell_folder: ../haskell # optional, this is the default value
## Meta-programming & scripting configuration
## List of Haskell modules that should be imported (in a qualified way) into the Mustache templating environment.
# You don't need to list modules used for table specs, hooks, preprocessors, etc. here.
haskell_modules: # optional, defaults to empty
- ACME.Mustache # Haskell module
## Haskell function that provides function macros that can extend SQL.
function_macros: ACME.macros # optional, Haskell symbol, defaults to none
## Custom validator
validator: ACME.validator # optional, Haskell symbol, defaults to none
## Haskell spec
# In addition to specifying tables in YAML, tables can also be specified in a Haskell program with the Napkin DSL.
haskell_spec: ACME.spec # optional, Haskell symbol, defaults to none
## Default extensions for Haskell interpreter
haskell_default_language_extensions:
- OverlappingInstances
- NoOverloadedStrings # extensions enabled by default can be disabled as well
## List of extra packages to be made available to Haskell scripting
# Note: these packages need to be present in a package DB. Please consult the user manual for directions on how to add extra packages.
haskell_packages: # optional, defaults to empty
- fmt
- yaml
## Spec preprocessors
preprocessors: # defaults to empty
# Prefix all tables (according to configuration) without hardcoding the prefix directly in the Spec and queries
- table_prefix:
# one or both of value and override_with_arg need to be specified
value: development # prefix
override_with_arg: environment # allows overriding via the CLI, e.g. --arg environment=production
separator: _ # defaults to empty
# scope: unmanaged
# scope: all
# only: [foo, bar, baz] # apply renamer only to selected tables
# except: [foo, bar, baz] # apply renamer to all tables except selected tables
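# With the settings above, a table named foo would be created as
# development_foo (or, say, production_foo when run with --arg environment=production)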
# Move all tables (according to configuration) to a dataset/schema without specifying it directly in the Spec and queries
- table_namespace:
# one or both of value and override_with_arg need to be specified
value: development # namespace
override_with_arg: environment # allows overriding via the CLI, e.g. --arg environment=production
scope: managed
# scope: unmanaged
# scope: all
only: [foo, bar, baz] # apply renamer only to selected tables
# except: [foo, bar, baz] # alternatively, apply renamer to all tables except selected tables
on_existing: overwrite # default, replace namespace even if it was specified explicitly
# on_existing: keep_original # keep original namespace if it has been specified explicitly in the Spec
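# With the settings above, the managed tables foo, bar, and baz would be
# created in the development dataset/schema (rendered as e.g. development.foo)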
# - autodiscover # autodiscover SQL files in sql_folder and add them as trivial tables
# Custom preprocessor with some arguments (Haskell symbol)
- ACME.preprocessor:
some: arguments
# Custom preprocessor without arguments (Haskell symbol)
- ACME.preprocessor
## Tables that are managed by Napkin
tables:
some_table: # Table name
# Defines the method the table will be created with
create_action:
# This table should be created with a SQL query stored in a file
sql_file:
source: some_query.sql # SQL file location relative to sql_folder
target_type:
table: # default
# Backend specific table options, optional
partitioning: # Big Query options
column: partitioning_column
range:
start: 1
end: 100
step: 2
write_disposition: append
clustering: [clustering_column]
# target_type: view # create a view instead of a table
# target_type: materialized_view # create a materialized view instead of a table
# target_type: # backend specific materialized view options
# materialized_view:
# timescale:
# continuous:
# create_group_indexes: false
# materialized_only: false
# aggregate_policy:
# start_offset: 3 hours
# end_offset: 1 hour
# schedule_interval: 1 hour
# retention_policy: 1 year
vars: # Variables for Mustache templating, defaults to none
foo: bar
baz: boo
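# For illustration (assuming standard Mustache syntax), some_query.sql could
# reference these variables in its SQL text as {{ foo }} and {{ baz }}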
# What should trigger the table being updated or recreated; optional, defaults to: always
update_strategy:
- type: always # Update table always
- type: periodically # Update table periodically (every hour = 3600 seconds). Requires metadata to be persisted between Napkin runs
period: 3600
- type: with_dependency # Update table when it's older than one of its dependencies, or when a dependency will also be updated during this run
- type: if_missing # Update table when it does not exist
# List of tables that are not referenced in the query, but should be considered as dependencies when scheduling, optional, defaults to empty
# Typically, Napkin discovers all dependencies automatically, so this rarely needs to be used. You can double-check the discovered dependencies with the dump command
deps:
- some_dependency
# List of tables that are referenced in the query but should not be considered as dependencies when scheduling, optional, defaults to empty
# Typically, Napkin discovers all dependencies automatically, so this rarely needs to be used. You can double-check the discovered dependencies with the dump command
hidden_deps:
- ignored_dependency
# Parser dialect to use for parsing queries for that table. See comments on the global attribute.
parser_dialect: napkin.bigquery
# Table tags; these can be used to selectively enable or disable table execution with CLI flags during development
tags: [foo, bar, baz]
## Pre-hooks can be used to validate input data
# See the examples below for the built-in assertions
pre_hooks: # optional, defaults to empty
- assert_unique:
table: source_table
columns: [some_column]
## Post-hooks can be used to validate query results
# The same set of assertions can be used in both pre- and post-hooks
# See the examples below for the built-in assertions
post_hooks: # optional, defaults to empty
- assert_unique:
table: some_table
columns: [result_column]
# on_failure controls how hook failures are handled; this attribute applies to all hooks
on_failure: fail_now # default, fail the whole spec as soon as a hook fails
# on_failure: fail_later # fail the whole spec, but allow the remaining hooks for this table to be executed
# on_failure: warn_only # don't fail the whole spec, failure will be logged in a report
inline_table:
create_action:
# This table should be created with a SQL query inlined in the YAML file
sql_query:
query: SELECT * FROM some_table
incremental_by_time_table:
create_action:
# This table should be updated with the incremental-by-time strategy. Please refer to the user manual for details
incremental_by_time:
source: some_query_with_cutoff.sql # SQL file location relative to sql_folder
timestamp_column: column_foo # Column that contains the timestamp
start_day: 2021-01-01 # Starting date, used when the table is created from scratch
lookback_days: 14 # Last N days slice to be updated during subsequent runs
vars: # Variables for Mustache templating; a "cutoff" variable is provided implicitly
foo: bar
baz: boo
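# A sketch of some_query_with_cutoff.sql, assuming Mustache syntax and a
# hypothetical source_events table:
#   SELECT * FROM source_events WHERE column_foo >= {{ cutoff }}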
incremental_pk_table:
create_action:
# This table should be updated with the incremental-by-primary-key strategy. Please refer to the user manual for details
incremental_by_pk:
source: some_query_with_cutoff.sql # SQL file location relative to sql_folder
pk: [column_foo]
start_day: 2021-01-01
lookback_days: 14
custom_prog:
# This table should be created by a custom program
# Since we don't pass any arguments to the program, we can use the simplified syntax
create_action: ACME.custom_prog
custom_prog_with_args:
create_action:
# This table should be created by a custom program
# In this case we pass some arguments to the program
ACME.custom_prog:
some_args: true
experimental/tardis:
create_action:
experimental/tardis:
source: acme
primary_key:
- id
ignore_fields:
- last_seen_at
dry_run_fields:
- id
- name
table_options:
clustering:
- id
experimental/tardis_present_view:
create_action:
experimental/tardis_present_view:
source: experimental/tardis
experimental/tardis_last_revisions_view:
create_action:
experimental/tardis_last_revisions_view:
source: experimental/tardis
experimental/tardis_time_travel_view:
create_action:
experimental/tardis_time_travel_view:
source: experimental/tardis
when: "2022-02-22T22:22:22Z"
all_hooks:
create_action:
sql_file:
source: some_query.sql
# Examples of all available hooks; these can be used as pre_hooks as well
post_hooks:
- assert_all_values_within:
table: Invoice
column: InvoiceDate
type: date
from: 2009-01-01
to: 2013-12-31
# on_failure controls how hook failures are handled; this attribute applies to all hooks
on_failure: fail_now # default, fail the whole spec as soon as a hook fails
# on_failure: fail_later # fail the whole spec, but allow the remaining hooks for this table to be executed
# on_failure: warn_only # don't fail the whole spec, failure will be logged in a report
- assert_any_values_within:
table: Artist
column: Count
type: int
from: 1
to: 25
- assert_no_values_within:
table: popular_tracks_for_pink_floyd
column: Name
values:
- Sweet Home Alabama
- Brain Damage
- assert_all_values_within:
table: popular_tracks_for_pink_floyd
column: Name
values:
- Time
- The Great Gig In The Sky
- Any Colour You Like
- Brain Damage
- Sweet Home Alabama
- assert_unique:
table: popular_tracks_for_pink_floyd
columns: [Name]
- assert_count:
table: popular_tracks_for_pink_floyd
equal: 123
- assert_count:
table: popular_tracks_for_pink_floyd
greater_than_or_equal: popular_tracks_for_pink_floyd
- assert_count:
table: popular_tracks_for_pink_floyd
approximately_equal: 123
type: absolute
# type: relative
tolerance: 10 # we can be off by 10 and it's still ok
- assert_cardinalities:
table: popular_tracks_for_pink_floyd
column: name
equal:
table: Artist
column: name
- assert_expression:
table: popular_tracks_for_pink_floyd
expression: name
- assert_not_null:
table: popular_tracks_for_pink_floyd
columns:
- some_column
- expression: NULLIF (some_column, '')
# custom hook without arguments
- ACME.custom_hook
# custom hook with arguments
- ACME.custom_hook:
target: artists_hashes