# Enable YAML schema in IDE
# yaml-language-server: $schema=https://napkin-public-storage-bucket.s3.us-east-1.amazonaws.com/schema/schema.json
# We don't allow extra fields, except for top-level fields that start with _.
# They may be useful when YAML anchors are used for code reuse (see the sketch below the anchor).
_commonOptions: &common
some_common: options
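# A sketch of reusing the anchor above via a YAML alias further down in the file; whether a
# given field accepts such a mapping depends on its schema, so treat this as illustrative only:
#   vars: *common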
## Application name
app_name: foo bar
## Metadata configuration
# Napkin maintains a database with the history of runs. It's used to estimate pipeline runtime as well as to enforce update strategies.
metadata_url: files:/home/user/.napkin/metadata # optional, by default metadata will be stored as files in a hidden directory next to the YAML file
# metadata_url: s3://bucket/some/prefix/ # metadata can also be stored in S3-compatible storage
#### Database connection configuration
## BigQuery
backend: BigQuery
db_url: bigquery://bigquery.googleapis.com/project-acme
backend_options: # backend specific options
labels: # can be used to segregate costs in Google Cloud billing dashboard
client: acme
repo: client-acme
concurrent_queries: 100
timeout: 300
## Postgres
# backend: Postgres
# db_url: postgresql://user@db/napkin_db
# backend_options: # backend specific options
# connection_pool: 100
## Sqlite
# backend: Sqlite
# db_url: sqlite:acme.db
# backend_options: {} # no backend options for Sqlite
## Location of SQL files when referenced in Spec.
sql_folder: sql # optional, default value is "../sql"
## SQL dialect to use when parsing queries
parser_dialect: napkin.bigquery
# The napkin.* family of dialects parses SQL and converts it to the Napkin intermediate representation.
# This enables Napkin optimization features; however, some SQL features are not available.
# Queries parsed this way are rendered according to the db backend syntax – this can be used to convert queries (e.g. parse as BigQuery, run with Postgres); a brief sketch of this follows the dialect options below.
# parser_dialect: napkin.bigquery
# parser_dialect: napkin.postgres
# parser_dialect: napkin.sqlite
# parser_dialect: napkin.ansi2011
# The generic.* family of dialects parses SQL, but does not convert it into the Napkin intermediate representation.
# This provides more compatibility with SQL features.
# Queries parsed this way are rendered according to the db backend dialect – this can be used to convert queries (e.g. parse as BigQuery, run with Postgres).
# parser_dialect: generic.bigquery
# parser_dialect: generic.postgres
# parser_dialect: generic.sqlite
# parser_dialect: generic.ansi2011
# Use a parser based on the Postgres SQL grammar.
# Postgres-specific features (such as the ->> JSON operator) can be used in this mode.
# Queries parsed this way are always rendered according to the Postgres dialect.
# parser_dialect: postgres
# Treat queries as plain text. Use this only if the other dialects fail.
# WARNING: dependency discovery as well as table renaming will not work with raw queries
# parser_dialect: raw
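# As mentioned above, parsing with one dialect while running against another backend can be
# used to convert queries. A minimal sketch, reusing the Postgres connection example from above:
# backend: Postgres
# db_url: postgresql://user@db/napkin_db
# parser_dialect: napkin.bigquery # queries written in BigQuery SQL, rendered for Postgres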
## Location of Haskell files when referenced in Spec.
haskell_folder: # allows specifying root folder(s), default value is "../haskell"
- ../../../haskell
## Meta-programming & scripting configuration
## List of Haskell modules that should be imported (in a qualified way) into the Mustache templating environment.
# You don't need to list modules used for table specs, hooks, preprocessors, etc. here.
haskell_modules: # optional, defaults to empty
- ACME.Mustache # Haskell module
## Custom validator
validator: ACME.validator # optional, Haskell symbol, defaults to none
## Haskell package definition
haskell_package:
hpack: ../../package.yaml # optional, location of the Hpack file to get dependencies and extension defaults from.
# cabal: ../../package-name.cabal
library: internal # optional, name of the internal library to extract information from.
## Haskell spec
# In addition to specifying tables in YAML, tables can also be specified in the Haskell program with the Napkin DSL.
haskell_spec: ACME.spec # optional, Haskell symbol, defaults to none
## Default extensions for Haskell interpreter
haskell_default_language_extensions:
- OverlappingInstances
- NoOverloadedStrings # extensions enabled by default can be disabled as well
## List of extra packages to be made available in Haskell scripting
# Note: these packages need to be present in a package DB. Please consult the user manual for directions on how to add extra packages.
haskell_dependencies: # optional, defaults to empty
- fmt
- yaml
## Spec preprocessors
preprocessors: # defaults to empty
# Sets the default dataset for the BigQuery backend
- bigquery_defaults:
dataset: sample
# Prefix all tables (according to configuration) without hardcoding the prefix directly in the Spec and queries (see the sketch after this block)
- table_prefix:
# one or both of value and override_with_arg need to be specified
value: development # prefix
override_with_arg: environment # allows overriding via the CLI, e.g. --arg environment=production
separator: _ # defaults to empty
# scope: unmanaged
# scope: all
# only: [foo, bar, baz] # apply renamer only to selected tables
# except: [foo, bar, baz] # apply renamer to all tables except selected tables
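# With the settings above, a table "foo" would be renamed (sketch) to "development_foo";
# running with --arg environment=production would produce "production_foo" instead.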
# Move all tables (according to configuration) to a dataset/schema without specifying it directly in the Spec and queries (see the sketch after this block)
- table_namespace:
# one or both of value and override_with_arg need to be specified
value: development # namespace
override_with_arg: environment # allows overriding via the CLI, e.g. --arg environment=production
scope: managed
# scope: unmanaged
# scope: all
only: [foo, bar, baz] # apply renamer only to selected tables
except: [foo, bar, baz] # apply renamer to all tables except selected tables
on_existing: overwrite # default, replace namespace even if it was specified explicitly
# on_existing: keep_original # keep original namespace if it has been specified explicitly in the Spec
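# With the settings above, a managed table "foo" would be moved (sketch) into the "development"
# dataset/schema, i.e. referenced as development.foo; running with --arg environment=production
# would place it under "production" instead.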
# - autodiscover # autodiscover SQL files in sql_folder and add them as trivial tables
# Custom preprocessor with some arguments (Haskell symbol)
- ACME.preprocessor:
some: arguments
# Custom preprocessor without arguments (Haskell symbol)
- ACME.preprocessor
## Arguments are visible in Mustache templates with the `args.` prefix and available to Haskell code as `SpecMetaArgs` (see the sketch after this block).
arguments:
arg:
description: some description
default: default
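# The argument above could then be referenced in templated SQL via the `args.` prefix.
# An illustrative query (the table and predicate are made up):
#   SELECT * FROM source_table WHERE environment = '{{ args.arg }}'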
## Tables that are unmanaged by Napkin may be explicitly imported.
## If this list is used, all unmanaged tables in the spec must be declared or
## run/validation will fail.
external_tables: # Unmanaged table list
- acme
- Artist
- foo
- Invoice
- popular_tracks_for_pink_floyd
- some_dependency
- source_table
- table_to_be_copied
## Tables that are managed by Napkin
tables:
some_table: # Table name
description: Some description # optional, defaults to empty
notes:
column_name: Column description
# Defines a method the table will be created with
create_action:
# This table should be created with a SQL query stored in a file
sql_file:
source: some_query.sql # SQL file location relative to sql_folder
target_type:
table: # default
# Backend specific table options, optional
partitioning: # BigQuery options
column: partitioning_column
interval: month
require_filter: false
expiration_seconds: 3600
# range:
# start: 1
# end: 100
# step: 2
write_disposition: append
clustering: [clustering_column]
# target_type: view # create a view instead of a table
# target_type: materialized_view # create a materialized view instead of a table
# target_type: # backend specific materialized view options
# materialized_view:
# timescale:
# continuous:
# create_group_indexes: false
# materialized_only: false
# aggregate_policy:
# start_offset: 3 hours
# end_offset: 1 hour
# schedule_interval: 1 hour
# retention_policy: 1 year
vars: # Variables for Mustache templating, defaults to none (see the sketch after this block)
foo: bar
baz: boo
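# The variables above could be referenced from some_query.sql via Mustache, a sketch assuming
# vars are exposed to the template under their own names (the table and predicate are illustrative):
#   SELECT * FROM source_table WHERE category = '{{ foo }}'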
# What should trigger the table being updated or recreated; optional, defaults to: always
update_strategy:
- type: always # Update table always.
- type: periodically # Update the table if the period (in seconds) between its last successful run and now is longer than the designated period, or if it has no successful runs.
period: 3600 # Requires metadata to be persisted between Napkin runs. Will not trigger a run merely because the table's most recent run resulted in an error (use if_errored instead).
# period: # Alternatively period can be specified with an object specifying one or more of the following fields: seconds, minutes, days, weeks
# seconds: 20
# minutes: 2
# days: 1
# weeks: 3
- type: with_dependency # Update the table when its last successful run is older than a dependency's last successful run, or when a dependency will also be updated during this run.
- type: if_missing # Update the table when it does not exist.
- type: if_errored # Update the table if its most recent run resulted in an error.
# List of tables that are not referenced in the query but should be considered a dependency when scheduling; optional, defaults to empty
# Typically, Napkin discovers all dependencies automatically, so this is rarely needed. You can double-check the discovered dependencies with the dump command
deps:
- some_dependency
# List of tables that are referenced in the query but should not be considered a dependency when scheduling; optional, defaults to empty
# Typically, Napkin discovers all dependencies automatically, so this is rarely needed. You can double-check the discovered dependencies with the dump command
hidden_deps:
- ignored_dependency
# Parser dialect to use for parsing queries for that table. See comments on the global attribute.
parser_dialect: napkin.bigquery
# Table tags, can be used to selectively enable or disable table execution with CLI flags during development
tags: [foo, bar, baz]
## Pre-hooks can be used to validate input data
# See the examples below for the built-in assertions
pre_hooks: # optional, defaults to empty
- assert_unique:
table: source_table
columns: [some_column]
## Post-hooks can be used to validate query results
# The same set of assertions can be used in both pre- and post-hooks
# See the examples below for the built-in assertions
post_hooks: # optional, defaults to empty
- assert_unique:
table: some_table # defaults to current table being defined
columns: [result_column]
# on_failure controls how hook failures are handled; this attribute applies to all hooks
on_failure: fail # default, fail the whole spec while allowing other tables' post-hooks to finish
# on_failure: warn_only # don't fail the whole spec, the failure will be logged in a report
inline_table:
create_action:
# This table should be created with a SQL query inlined in the YAML file
sql_query:
query: SELECT * FROM some_table
incremental_by_time_table:
create_action:
# This table should be updated with the incremental-by-time strategy. Please refer to the user manual for details (a query sketch follows this block)
incremental_by_time:
source: some_query_with_cutoff.sql # SQL file location relative to sql_folder
timestamp_column: column_foo # Column that contains timestamp
start_day: 2021-01-01 # Starting date, used when table is created from scratch
lookback_days: 14 # Last N days slice to be updated during subsequent runs
vars: # Variables for Mustache templating; this program implicitly provides a "cutoff" variable
foo: bar
baz: boo
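# A sketch of what some_query_with_cutoff.sql could contain, assuming the implicit "cutoff"
# variable is interpolated via Mustache (the exact reference syntax and predicate are illustrative):
#   SELECT * FROM source_table WHERE column_foo >= {{ cutoff }}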
incremental_pk_table:
create_action:
# This table should be updated with the incremental-by-primary-key strategy. Please refer to the user manual for details
incremental_by_pk:
source: some_query_with_cutoff.sql # SQL file location relative to sql_folder
pk: [column_foo]
start_day: 2021-01-01
lookback_days: 14
custom_prog:
# This table should be created by a custom program.
# Since we don't pass any arguments to the program, we can use the simplified syntax
create_action: ACME.custom_prog
custom_prog_with_args:
create_action:
# This table should be created by a custom program
# In this case we pass some arguments to the program
ACME.custom_prog:
some_args: true
import_from_csv_file:
# Import a CSV file
create_action:
csv_file:
source: file.csv # CSV file to be imported
chunks: 500 # default, specifies the chunk size
header: true # whether a header row is present
if_exists: recreate # default, recreate table
# if_exists: append # keep existing table and append to it
# if_exists: truncate # keep existing table but drop its previous content
csv_separator: ',' # default, separator character used between fields
csv_quote_char: '"' # default, quote character that may surround fields
columns: # columns specification
- field1: integer # type is one of: integer | double | date | date_time | text
- field2: date
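# A matching file.csv could look like this (illustrative data, consistent with the header,
# separator and column settings above):
#   field1,field2
#   1,2021-01-01
#   2,2021-02-01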
copy_table:
create_action:
# This table should be created by copying an existing `table_to_be_copied` table
copy:
table: table_to_be_copied # table to be copied
if_exists: recreate # default, drop the `copy_table` table if it exists and recreate it
# if_exists: append # keep existing `copy_table`, and append to it
# if_exists: truncate # keep existing `copy_table` but drop its previous content
experimental/tardis:
create_action:
experimental/tardis:
source: acme
primary_key:
- id
ignore_fields:
- last_seen_at
dry_run_fields:
- id
- name
table_options:
clustering:
- id
experimental/tardis_present_view:
create_action:
experimental/tardis_present_view:
source: experimental/tardis
experimental/tardis_last_revisions_view:
create_action:
experimental/tardis_last_revisions_view:
source: experimental/tardis
experimental/tardis_time_travel_view:
create_action:
experimental/tardis_time_travel_view:
source: experimental/tardis
when: "2022-02-22T22:22:22Z"
all_hooks:
create_action:
sql_file:
source: some_query.sql
# Examples of all available hooks; they can be used as pre_hooks as well
post_hooks:
- assert_all_values_within:
table: Invoice
column: InvoiceDate
type: date
from: 2009-01-01
to: 2013-12-31
# on_failure controls how hook failures are handled; this attribute applies to all hooks
on_failure: fail # default, fail the whole spec while allowing other tables' post-hooks to finish
# on_failure: warn_only # don't fail the whole spec, the failure will be logged in a report
- assert_any_values_within:
table: Artist
column: Count
type: int
from: 1
to: 25
- assert_no_values_within:
table: popular_tracks_for_pink_floyd
column: Name
values:
- Sweet Home Alabama
- Brain Damage
- assert_all_values_within:
table: popular_tracks_for_pink_floyd
column: Name
values:
- Time
- The Great Gig In The Sky
- Any Colour You Like
- Brain Damage
- Sweet Home Alabama
- assert_unique:
table: popular_tracks_for_pink_floyd
columns: [Name]
- assert_count:
table: popular_tracks_for_pink_floyd
equal: 123
- assert_count:
table: popular_tracks_for_pink_floyd
greater_than_or_equal: popular_tracks_for_pink_floyd
- assert_count:
table: popular_tracks_for_pink_floyd
approximately_equal: 123
type: absolute
# type: relative
tolerance: 10 # we can be off by 10 and it's still ok
- assert_cardinalities:
table: popular_tracks_for_pink_floyd
column: name
equal:
Artist:
- name
- assert_expression:
table: popular_tracks_for_pink_floyd
expression: name
- assert_not_null:
table: popular_tracks_for_pink_floyd
columns:
- some_column
- expression: NULLIF(some_column, '')
# custom hook without arguments
- ACME.custom_hook
# custom hook with arguments
- ACME.custom_hook:
table: artists_hashes