Link Search Menu Expand Document

Spec YAML reference

General reference

# Enable YAML schema in IDE
# yaml-language-server: $schema=https://napkin-public-storage-bucket.s3.us-east-1.amazonaws.com/schema/schema.json

# We don't allow extra fields except for top-level fields that start with _.
# They may be useful when YAML anchors are used for code reuse.
_commonOptions: &common
  some_common: options

## Application name
app_name: foo bar

## Metadata configuration
# Napkin maintains a database with the history of runs. It's used to estimate pipeline runtime as well as to enforce update strategies.
metadata_url: files:/home/user/.napkin/metadata # optional, by default metadata will be stored in files a hidden directory next to YAML file
# metadata_url: s3://bucket/some/prefix/ # metadata can be also stored in S3-compatible storage

#### Database connection configuration

## BigQuery
backend: BigQuery
db_url: bigquery://bigquery.googleapis.com/project-acme
backend_options: # backend specific options
  labels: # can be used to segregate costs in Google Cloud billing dashboard
    client: acme
    repo: client-acme
  concurrent_queries: 100
  http_timeout: 300
  job_timeout: 600

## Postgres
# backend: Postgres
# db_url: postgresql://user@db/napkin_db
# backend_options: # backend specific options
#   connection_pool: 100

## Sqlite
# backend: Sqlite
# db_url: sqlite:acme.db
# backend_options: {} # no backend options for Sqlite

## Location of SQL files when referenced in Spec.
sql_folder: sql # optional, default value is "../sql"

## SQL dialect to use when parsing queries
parser_dialect: napkin.bigquery
# napkin.* family of dialects will parse SQL then convert to Napkin intermediate representation.
# This enables Napkin optimization features, however some SQL features are not available.
# Queries parsed that way will be rendered accordingly to db backend syntax – this can be used to convert queries (e.g. parse as BigQuery, run with Postgres).
  # parser_dialect: napkin.bigquery
  # parser_dialect: napkin.postgres
  # parser_dialect: napkin.sqlite
  # parser_dialect: napkin.ansi2011
# generic.* family of dialects will parse SQL, but will not convert into Napkin intermediate representation.
# This provides is provides more compatibility with SQL features.
# Queries parsed that way will be rendered accordingly to db backend dialect – this can be used to convert queries (e.g. parse as BigQuery, run with Postgres).
  # parser_dialect: generic.bigquery
  # parser_dialect: generic.postgres
  # parser_dialect: generic.sqlite
  # parser_dialect: generic.ansi2011
# Use parser based on Postgres SQL grammar.
# Postgres-specific features (such as ->> JSON operator) can be used with in this mode.
# Queries parsed that way will be always rendered according to Postgres dialect.
  # parser_dialect: postgres
# Consider queries as a plain text. Use this only if other fail.
# WARNING: dependency discovery as well as table renaming will not work with raw queries
  # parser_dialect: raw


## Location of Haskell files when referenced in Spec.
haskell_folder: # allows to specify root folder(s), default value is "../haskell"
  - ../../../haskell

## Meta-programming & scripting configuration

## List of Haskell modules that should be imported (in a qualified way) into the Mustache templating environment.
# You don't need to list modules used for table specs, hooks, preprocessors, etc. here.
haskell_modules: # optional, defaults to empty
  - ACME.Mustache # Haskell module

## Custom validator
validator: ACME.validator # optional, Haskell symbol, defaults to none

## Haskell package definition
haskell_package:
  hpack: ../../package.yaml # optional, location of the Hpack file to get dependencies and extension defaults from.
  # cabal: ../../package-name.cabal
  library: internal # optional, name of the internal library to extract information from.

## Haskell spec
# In addition to specifying tables in YAML, tables can be also specified in the Haskell program with Napkin DSL.
haskell_spec: ACME.spec # optional, Haskell symbol, defaults to none

## Default extensions for Haskell interpreter
haskell_default_language_extensions:
  - OverlappingInstances
  - NoOverloadedStrings # extensions enabled by default can be disabled as well

## List of extra packages to be made available in the Haskell scripting
# Note: these extensions need to be present in a package DB. Please consult the user manual for directions on how to add extra packages.
haskell_dependencies: # optional, defaults to empty
  - fmt
  - yaml

## Spec preprocessors
preprocessors: # defaults to empty
  # Sets the default dataset for bigquery backend
  - bigquery_defaults:
      dataset: sample

  # Prefix all tables (according to configuration) without hardcoding the prefix directly in the Spec and queries
  - table_prefix:
      # one or both of value and override_with_arg need to be specified
      value: development # prefix
      override_with_arg: environment # allow to override with CLI, e.g. --arg environment=production
      separator: _ # defaults to empty,
      # scope: unmanaged
      # scope: all
      # only: [foo, bar, baz] # apply renamer only to selected tables
      # except: [foo, bar, baz] # apply renamer to all tables except selected tables

  # Move all tables (according to configuration) to dataset/schema without specifying it directly in the Spec and queries
  - table_namespace:
      # one or both of value and override_with_arg need to be specified
      value: development # namespace
      override_with_arg: environment # allow to override with CLI, e.g. --arg environment=production
      scope: managed
      # scope: unmanaged
      # scope: all
      only: [foo, bar, baz] # apply renamer only to selected tables
      except: [foo, bar, baz] # apply renamer to all tables except selected tables
      on_existing: overwrite # default, replace namespace even if it was specified explicitly
      # on_existing: keep_original # keep original namespace if it has been specified explicitly in the Spec

  # - autodiscover # autodiscover sql files in sql_dir and add them as trivial tables

  # Custom preprocessor with some arguments (Haskell symbol)
  - ACME.preprocessor:
      some: arguments

  # Custom preprocessor without arguments (Haskell symbol)
  - ACME.preprocessor

## Arguments are visible in mustache with `args.` prefix and available for the Haskell code as `SpecMetaArgs`.
arguments:
  arg:
    description: some description
    default: default

## Tables that are unmanaged by Napkin may be explicitly imported,
## If this flag is used all unmanaged tables in spec must be declared or
## run/validation will fail.
external_tables: # Unmanaged table list
  - acme
  - Artist
  - foo
  - Invoice
  - popular_tracks_for_pink_floyd
  - some_dependency
  - source_table
  - table_to_be_copied

## Tables that are managed by Napkin
tables:
  some_table: # Table name
    description: Some description # optional, defaults to empty
    notes:
      column_name: Column description
    # Defines a method the table will be created with
    create_action:
      # This table should be created with a SQL query stored in a file
      sql_file:
        source: some_query.sql # SQL file location relative to sql_dir
        target_type:
          table: # default
          # Backend specific table options, optional
            partitioning: # Big Query options
              column: partitioning_column
              interval: month
              require_filter: false
              expiration_seconds: 3600
              # range:
              #   start: 1
              #   end: 100
              #   step: 2
            write_disposition: append
            clustering: [clustering_column]
        # target_type: view # create view instead of table
        # target_type: materialized_view # create view instead of table

        # target_type: # backend specific materialized view options
        #   materialized_view:
        #     timescale:
        #       continuous:
        #         create_group_indexes: false
        #         materialized_only: false
        #         aggregate_policy:
        #           start_offset: 3 hours
        #           end_offset: 1 hour
        #           schedule_interval: 1 hour
        #         retention_policy: 1 year
        vars: # Variables for Mustache templating, defaults to none
          foo: bar
          baz: boo

    # What should trigger table being updated or recreated, optional, defaults to: always
    update_strategy:
      - type: always # Update table always.
      - type: periodically # Update table if the period (in seconds) between last successful run and now is longer than designated period, or if it has no successful runs.
        period: 3600       # Requires metadata to be persisted between Napkin runs.  Will not run if table merely resulted in an error in last temporal run (use if_errored instead).
        # period:          # Alternatively period can be specified with an object specifying one or more of the following fields: seconds, minutes, days, weeks
        #   seconds: 20
        #   minutes: 2
        #   days: 1
        #   weeks: 3
      - type: with_dependency # Update table when its last successful run is older than its dependency's last successful run, or the dependency will update as well during this run.
      - type: if_missing # Update table when it does not exist.
      - type: if_errored # Update if the tables last temporal run resulted in an error.

    # List of tables that are not referenced in the query, but should be considered as a dependency when scheduling, optional, defaults to empty
    # Typically, Napkin will discover all dependencies automatically, so it does not have to be used. You can double-check dependencies discovered with dump command
    deps:
      - some_dependency

    # List of tables that are referenced in the query and should not be considered as a dependency when scheduling, optional, defaults to empty
    # Typically, Napkin will discover all dependencies automatically, so it does not have to be used. You can double-check dependencies discovered with dump command
    hidden_deps:
      - ignored_dependency

    # Parser dialect to use for parsing queries for that table. See comments on the global attribute.
    parser_dialect: napkin.bigquery

    # Table tags, can be used to selectively enable or disable table execution with CLI flags during development
    tags: [foo, bar, baz]

    ## Pre-hooks can be used to validate input data
    # See examples below what assertions are built-in
    pre_hooks: # optional, defaults to empty
      - assert_unique:
          table: source_table
          columns: [some_column]

    ## Post-hooks can be used to validate query results
    # The same set of assertions can be used both in pre and post hooks
    # See examples below what assertions are built-in
    post_hooks: # optional, defaults to empty
      - assert_unique:
          table: some_table # defaults to current table being defined
          columns: [result_column]
          # on_failure controls how the hook failures are handled, this attribute applies to all hooks
          on_failure: fail # default, fail the whole spec allowing other table's post hooks to finish
          # on_failure: warn_only # don't fail the whole spec, failure will be logged in a report

  inline_table:
    create_action:
      # This table should be created with a SQL query inlined in the YAML file
      sql_query:
        query: SELECT * FROM some_table

  incremental_by_time_table:
    create_action:
      # This table should be updated with incremental by time strategy. Please refer to the user manual for details
      incremental_by_time:
        source: some_query_with_cutoff.sql # SQL file location relative to sql_dir
        timestamp_column: column_foo # Column that contains timestamp
        start_day: 2021-01-01 # Starting date, used when table is created from scratch
        lookback_days: 14 # Last N days slice to be updated during subsequent runts
        vars: # Variables for Mustache templating, this program will implicitly provide "cutoff" variable
          foo: bar
          baz: boo

  incremental_pk_table:
    create_action:
    # This table should be updated with incremental by time strategy. Please refer to the user manual for details
      incremental_by_pk:
        source: some_query_with_cutoff.sql # SQL file location relative to sql_dir
        pk: [column_foo]
        start_day: 2021-01-01
        lookback_days: 14

  custom_prog:
    # This table should be created by a custom program
    # since we don't provide any arguments to the program, we can use simplified syntax
    create_action: ACME.custom_prog

  custom_prog_with_args:
    create_action:
      # This table should be created by a custom program
      # In this case we pass some arguments to the program
      ACME.custom_prog:
        some_args: true

  import_from_csv_file:
    # Import a csv file
    create_action:
      csv_file:
        source: file.csv # csv file to be imported
        chunks: 500 # default, specifies chunks size
        header: true # header is present or not
        if_exists: recreate # default, recreate table
        # if_exists: append # keep existing table and append to it
        # if_exists: truncate # keep existing table but drop its previous content
        csv_separator: ',' # default separator character to be used in between fields
        csv_quote_char: '"' # default quote character that may sometimes be present around fields
        columns: # columns specification
          - field1: integer # type :- integer | double | date | date_time | text
          - field2: date

  copy_table:
    create_action:
      # This table should be created by copying an existing `table_to_be_copied` table
      copy:
        table: table_to_be_copied # table to be copied
        if_exists: recreate # default, drop `copy_table` table if exists and recreate it
        # if_exists: append # keep existing `copy_table`, and append to it
        # if_exists: truncate # keep existing `copy_table` but drop its previous content

  experimental/tardis:
    create_action:
      experimental/tardis:
        source: acme
        primary_key:
          - id
        ignore_fields:
          - last_seen_at
        dry_run_fields:
          - id
          - name
        table_options:
          clustering:
            - id
  experimental/tardis_present_view:
    create_action:
      experimental/tardis_present_view:
        source: experimental/tardis
  experimental/tardis_last_revisions_view:
    create_action:
      experimental/tardis_last_revisions_view:
        source: experimental/tardis
  experimental/tardis_time_travel_view:
    create_action:
      experimental/tardis_time_travel_view:
        source: experimental/tardis
        when: "2022-02-22T22:22:22Z"

  all_hooks:
    create_action:
      sql_file:
        source: some_query.sql

    # Example of all hooks available, can be used as pre_hooks as well
    post_hooks:
      - assert_all_values_within:
          table: Invoice
          column: InvoiceDate
          type: date
          from: 2009-01-01
          to: 2013-12-31

          # on_failure controls how the hook failures are handled, this attribute applies to all hooks
          on_failure: fail # default, fail the whole spec allowing other table's post hooks to finish
          # on_failure: warn_only # don't fail the whole spec, failure will be logged in a report

      - assert_any_values_within:
          table: Artist
          column: Count
          type: int
          from: 1
          to: 25

      - assert_no_values_within:
          table: popular_tracks_for_pink_floyd
          column: Name
          values:
            - Sweet Home Alabama
            - Brain Damage

      - assert_all_values_within:
          table: popular_tracks_for_pink_floyd
          column: Name
          values:
            - Time
            - The Great Gig In The Sky
            - Any Colour You Like
            - Brain Damage
            - Sweet Home Alabama

      - assert_unique:
          table: popular_tracks_for_pink_floyd
          columns: [Name]

      - assert_count:
          table: popular_tracks_for_pink_floyd
          equal: 123

      - assert_count:
          table: popular_tracks_for_pink_floyd
          greater_than_or_equal: popular_tracks_for_pink_floyd

      - assert_count:
          table: popular_tracks_for_pink_floyd
          approximately_equal: 123
          type: absolute
          # type: relative
          tolerance: 10 # we can be off by 10 and it's still ok

      - assert_cardinalities:
          table: popular_tracks_for_pink_floyd
          column: name
          equal:
            Artist:
              - name

      - assert_expression:
          table: popular_tracks_for_pink_floyd
          expression: name

      - assert_not_null:
          table: popular_tracks_for_pink_floyd
          columns:
            - some_column
            - expression: NULLIF (some_column, '')

      # custom hook without arguments
      - ACME.custom_hook

      # custom hook with arguments
      - ACME.custom_hook:
          table: artists_hashes

Backend-specific options

BigQuery

# yaml-language-server: $schema=https://napkin-public-storage-bucket.s3.us-east-1.amazonaws.com/schema/schema.json

backend: BigQuery
db_url: bigquery://bigquery.googleapis.com/project-acme
sql_folder: sql
backend_options: # BigQuery-specific options
  labels: # can be used to segregate costs in Google Cloud billing dashboard
    client: acme
    repo: client-acme
  concurrent_queries: 100
  http_timeout: 300
  job_timeout: 600

preprocessors:
  - bigquery_defaults:
      dataset: acme_analytics

tables:
  some_table:
    create_action:
      sql_file:
        source: [some_query.sql] # SQL file location relative to sql_dir
        target_type:
          table: # BigQuery-specific table options
            partitioning: # Big Query options
              column: partitioning_column
              range:
                start: 1
                end: 100
                step: 2
            write_disposition: append
            clustering: [clustering_column]

  some_view:
    create_action:
      sql_file:
        source: [some_query.sql] # SQL file location relative to sql_dir
        target_type:
          view: # BigQuery-specific view options
            authorized_datasets: # Napkin runner has to have `bigquery.datasets.update` permission for each of datasets
              - dataset_id: acme_dataset
              - project_id: project-ecma
                dataset_id: ecma_dataset
            expiration_timestamp: 2024-01-30 20:00:00Z

  some_materialized_view:
    create_action:
      sql_file:
        source: [some_query.sql] # SQL file location relative to sql_dir
        target_type:
          materialized_view: # BigQuery-specific materialized view options
            authorized_datasets: # Napkin runner has to have `bigquery.datasets.update` permission for each of datasets
              - dataset_id: acme_dataset
              - project_id: project-ecma
                dataset_id: ecma_dataset
            clustering: [clustering_column]
            partitioning:
              column: partitioning_column
              range:
                start: 1
                end: 100
                step: 2
            refresh:
              minutes: 15

Postgres

# yaml-language-server: $schema=https://napkin-public-storage-bucket.s3.us-east-1.amazonaws.com/schema/schema.json

backend: Postgres
db_url: postgresql://user@db/napkin_db
sql_folder: sql
backend_options: # Postgres specific options
  connection_pool: 100

tables:
  timescale_continuous_aggregate:
    create_action:
      sql_file:
        source: [some_query.sql, some_query2.sql]
        target_type: # backend specific materialized view options
          materialized_view: # Postgres-specific options
            timescale:
              continuous: # Allows to create TimescaleDB continuous aggregate
                create_group_indexes: false # optional
                materialized_only: false # optional
                aggregate_policy: # optional
                  start_offset: 3 hours
                  end_offset: 1 hour
                  schedule_interval: 1 hour
                retention_policy: 1 year # optional

  indexes_for_table:
    create_action:
      sql_file:
        source: some_query.sql
        target_type:
          table:
            indexes: # add indexes to the table after it's created
              - a ASC, b DESC, c ASC NULLS FIRST # columns/expressions used in index
              - name: foo # index name, optional
                columns: a ASC, b DESC, c ASC NULLS FIRST  # columns/expressions used in index
                unique: true # optional, creates unique index
                concurrent: true # optional, index will be created concurrently
                where: d IS NOT NULL # optional, allows to create partial index

  indexes_for_materialized_view:
    create_action:
      sql_file:
        source: some_query.sql
        target_type:
          materialized_view:
            indexes: # add indexes to the table after it's created
              - a ASC, b DESC, c ASC NULLS FIRST # columns/expressions used in index
              - name: foo # index name, optional
                columns: a ASC, b DESC, c ASC NULLS FIRST  # columns/expressions used in index
                unique: true # optional, creates unique index
                concurrent: true # optional, index will be created concurrently
                where: d IS NOT NULL # optional, allows to create partial index

Redshift

# yaml-language-server: $schema=https://napkin-public-storage-bucket.s3.us-east-1.amazonaws.com/schema/schema.json

backend: Redshift
db_url: postgresql://user@db/napkin_db
sql_folder: sql
backend_options: # Redshift-specific options
  connection_pool: 100

tables:
  some_table:
    create_action:
      sql_file:
        source: some_query.sql
        target_type:
          table: # Redshift-specific table options
            dist_style: even
            local: true
            sorting:
              keys:
                - foo
              style: interleaved
            temp: false