Catalogs and Tables#

Note

Daft’s catalog and tables APIs are early in their development.

class Catalog[source]#

Interface for python catalog implementations.

static from_iceberg(catalog: object) Catalog[source]#

Creates a Daft Catalog instance from an Iceberg catalog.

Parameters:

catalog (object) – pyiceberg catalog object

Returns:

new daft catalog instance from the pyiceberg catalog object.

Return type:

Catalog

static from_pydict(tables: dict[str, object], name: str = 'default') Catalog[source]#

Returns an in-memory catalog from a dictionary of table-like objects.

The table-like objects can be pydicts, dataframes, or a Table implementation.

Examples

>>> import daft
>>> from daft.catalog import Catalog, Table
>>>
>>> dictionary = {"x": [1, 2, 3]}
>>> dataframe = daft.from_pydict(dictionary)
>>> table = Table.from_df("temp", dataframe)
>>>
>>> catalog = Catalog.from_pydict(
...     {
...         "R": dictionary,
...         "S": dataframe,
...         "T": table,
...     }
... )
>>> catalog.list_tables()
['R', 'S', 'T']
Parameters:

tables (dict[str,object]) – a dictionary of table-like objects (pydicts, dataframes, and tables)

Returns:

new in-memory catalog instance with the given name (defaults to ‘default’)

Return type:

Catalog

static from_s3tables(table_bucket_arn: str, client: Optional[object] = None, session: Optional[object] = None)[source]#

Creates a Daft Catalog from S3 Tables bucket ARN, with optional client or session.

If neither a boto3 client nor session is given, an Iceberg REST client is used.

Parameters:
  • table_bucket_arn (str) – s3tables bucket arn

  • client – optional boto3 client

  • session – optional boto3 session

Returns:

new daft catalog instance backed by S3 Tables.

Return type:

Catalog

static from_unity(catalog: object) Catalog[source]#

Creates a Daft Catalog instance from a Unity catalog.

Parameters:

catalog (object) – unity catalog object

Returns:

new daft catalog instance from the unity catalog object.

Return type:

Catalog

abstract get_table(identifier: daft.catalog.Identifier | str) Table[source]#

Gets a table by its identifier, or raises if the table does not exist.

Parameters:

identifier (Identifier|str) – table identifier

Returns:

matched table or raises if the table does not exist.

Return type:

Table

abstract list_namespaces(pattern: Optional[str] = None) list[daft.catalog.Identifier][source]#

List namespaces in the catalog which match the given pattern.

Parameters:

pattern (str) – pattern to match such as a namespace prefix

Returns:

list of namespace identifiers matching the pattern.

Return type:

list[Identifier]

abstract list_tables(pattern: Optional[str] = None) list[str][source]#

List tables in the catalog which match the given pattern.

Parameters:

pattern (str) – pattern to match such as a namespace prefix

Returns:

list of table identifiers matching the pattern.

Return type:

list[str]

load_table(name: str) Table[source]#

DEPRECATED: Please use get_table instead; version=0.5.0!

abstract property name: str#

Returns the catalog’s name.

read_table(identifier: daft.catalog.Identifier | str, **options) DataFrame[source]#

Returns the table as a DataFrame or raises an exception if it does not exist.

class Identifier(*parts: str)[source]#

A reference (path) to a catalog object.

Example

>>> id = Identifier("a", "b")
>>> assert len(id) == 2
drop(n: int = 1) Identifier[source]#

Returns a new Identifier with the first n parts removed.

Parameters:

n (int) – Number of parts to drop from the beginning. Defaults to 1.

Returns:

A new Identifier with the first n parts removed.

Return type:

Identifier

Raises:

ValueError – If dropping n parts would result in an empty Identifier.

static from_sql(input: str, normalize: bool = False) Identifier[source]#

Parses an Identifier from an SQL string, normalizing to lowercase if specified.

Example

>>> from daft.catalog import Identifier
>>> Identifier.from_sql("namespace.table") == Identifier("namespace", "table")
>>> Identifier.from_sql('"a.b"') == Identifier("a.b")
>>> Identifier.from_sql('ABC."xYz"', normalize=True) == Identifier("abc", "xYz")
Parameters:
  • input (str) – input sql string

  • normalize (bool) – flag to case-normalize the identifier text

Returns:

new identifier instance

Return type:

Identifier

static from_str(input: str) Identifier[source]#

Parses an Identifier from a dot-delimited Python string without normalization.

Example

>>> from daft.catalog import Identifier
>>> Identifier.from_str("namespace.table") == Identifier("namespace", "table")
Parameters:

input (str) – input identifier string

Returns:

new identifier instance

Return type:

Identifier

class Table[source]#

Interface for python table implementations.

append(df: DataFrame, **options) None[source]#

Appends the DataFrame to this table.

Parameters:
  • df (DataFrame) – dataframe to append

  • **options – additional format-dependent write options

static from_df(name: str, dataframe: DataFrame) Table[source]#

Returns a read-only table backed by the DataFrame.

Example

>>> import daft
>>> from daft.catalog import Table
>>> Table.from_df("my_table", daft.from_pydict({"x": [1, 2, 3]}))
Parameters:
  • name (str) – table name

  • dataframe (DataFrame) – table source dataframe

Returns:

new table instance

Return type:

Table

static from_iceberg(table: object) Table[source]#

Creates a Daft Table instance from an Iceberg table.

Parameters:

table (object) – a pyiceberg table

Returns:

new daft table instance

Return type:

Table

static from_pydict(name: str, data: dict[str, InputListType]) Table[source]#

Returns a read-only table backed by the given data.

Example

>>> from daft.catalog import Table
>>> table = Table.from_pydict("my_table", {"foo": [1, 2]})
>>> table.show()
╭───────╮
│ foo   │
│ ---   │
│ Int64 │
╞═══════╡
│ 1     │
├╌╌╌╌╌╌╌┤
│ 2     │
╰───────╯

(Showing first 2 of 2 rows)
Parameters:
  • name (str) – table name

  • data (dict[str, object]) – keys are column names and the values are python lists, numpy arrays, or arrow arrays.

Returns:

new read-only table instance

Return type:

Table

static from_unity(table: object) Table[source]#

Returns a Daft Table instance from a Unity table.

Parameters:

table (object) – a unity table object

abstract property name: str#

Returns the table’s name.

overwrite(df: DataFrame, **options) None[source]#

Overwrites this table with the given DataFrame.

Parameters:
  • df (DataFrame) – dataframe to overwrite this table with

  • **options – additional format-dependent write options

abstract read(**options) DataFrame[source]#

Creates a new DataFrame from this table.

Parameters:

**options – additional format-dependent read options

Returns:

new DataFrame instance

Return type:

DataFrame

select(*columns: ColumnInputType) DataFrame[source]#

Creates a new DataFrame from the table applying the provided expressions.

Parameters:

*columns (Expression|str) – columns to select from this table

Returns:

new DataFrame instance with the selected columns

Return type:

DataFrame

show(n: int = 8) None[source]#

Shows the first n rows from this table.

Parameters:

n (int) – number of rows to show

Returns:

None

to_dataframe() DataFrame[source]#

DEPRECATED: Please use read instead; version 0.5.0!

abstract write(df: DataFrame, mode: Literal['append', 'overwrite'] = 'append', **options) None[source]#

Writes the DataFrame to this table.

Parameters:
  • df (DataFrame) – dataframe to write

  • mode (str) – write mode such as ‘append’ or ‘overwrite’

  • **options – additional format-dependent write options