Configuration#

Configure the execution backend, set Daft configuration parameters at execution time, and control how Daft interacts with storage.

Setting the Runner#

Control the execution backend that Daft will run on by calling these functions once at the start of your application.

set_runner_native #

set_runner_native(
    num_threads: int | None = None,
) -> DaftContext

Configure Daft to execute dataframes using native multi-threaded processing.

This is the default execution mode for Daft.

Returns:

DaftContext: Updated Daft execution context configured for native execution.

Note

Can also be configured via environment variable: DAFT_RUNNER=native

Source code in daft/context.py
def set_runner_native(num_threads: int | None = None) -> DaftContext:
    """Configure Daft to execute dataframes using native multi-threaded processing.

    This is the default execution mode for Daft.

    Returns:
        DaftContext: Updated Daft execution context configured for native execution.

    Note:
        Can also be configured via environment variable: DAFT_RUNNER=native
    """
    py_ctx = _set_runner_native(num_threads=num_threads)

    return DaftContext._from_native(py_ctx)
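
A minimal usage sketch (the parquet path is a placeholder; set_runner_native is imported from daft.context, matching the source path shown above):

import daft
from daft.context import set_runner_native

# Pin the default native runner to an explicit thread count before building any dataframes
set_runner_native(num_threads=8)

df = daft.read_parquet("data/example.parquet")  # placeholder path
df.show()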

set_runner_ray #

set_runner_ray(
    address: str | None = None,
    noop_if_initialized: bool = False,
    max_task_backlog: int | None = None,
    force_client_mode: bool = False,
) -> DaftContext

Configure Daft to execute dataframes using the Ray distributed computing framework.

Parameters:

address (str | None, default: None): Ray cluster address to connect to. If None, connects to or starts a local Ray instance.
noop_if_initialized (bool, default: False): If True, skip initialization if Ray is already running.
max_task_backlog (int | None, default: None): Maximum number of tasks that can be queued. None means Daft will automatically determine a good default.
force_client_mode (bool, default: False): If True, forces Ray to run in client mode.

Returns:

DaftContext: Updated Daft execution context configured for Ray.

Note

Can also be configured via environment variable: DAFT_RUNNER=ray

Source code in daft/context.py
def set_runner_ray(
    address: str | None = None,
    noop_if_initialized: bool = False,
    max_task_backlog: int | None = None,
    force_client_mode: bool = False,
) -> DaftContext:
    """Configure Daft to execute dataframes using the Ray distributed computing framework.

    Args:
        address: Ray cluster address to connect to. If None, connects to or starts a local Ray instance.
        noop_if_initialized: If True, skip initialization if Ray is already running.
        max_task_backlog: Maximum number of tasks that can be queued. None means Daft will automatically determine a good default.
        force_client_mode: If True, forces Ray to run in client mode.

    Returns:
        DaftContext: Updated Daft execution context configured for Ray.

    Note:
        Can also be configured via environment variable: DAFT_RUNNER=ray
    """
    py_ctx = _set_runner_ray(
        address=address,
        noop_if_initialized=noop_if_initialized,
        max_task_backlog=max_task_backlog,
        force_client_mode=force_client_mode,
    )

    return DaftContext._from_native(py_ctx)
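
A sketch of attaching to an existing Ray cluster before any dataframes are created (the address is a placeholder):

import daft
from daft.context import set_runner_ray

# Attach to a running Ray cluster; omit address to start or join a local Ray instance instead
set_runner_ray(address="ray://head-node:10001")  # placeholder address

df = daft.from_pydict({"x": [1, 2, 3]})
df.collect()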

Setting Configurations#

Configure Daft in various ways during execution.

set_planning_config #

set_planning_config(
    config: PyDaftPlanningConfig | None = None,
    default_io_config: IOConfig | None = None,
) -> DaftContext

Globally sets various configuration parameters which control Daft plan construction behavior.

These configuration values are used when a Dataframe is being constructed (e.g. calls to create a Dataframe, or to build on an existing Dataframe).

Parameters:

config (PyDaftPlanningConfig | None, default: None): A PyDaftPlanningConfig object to set the config to, before applying other kwargs. Defaults to None, which indicates that the old (current) config should be used.
default_io_config (IOConfig | None, default: None): A default IOConfig to use in the absence of one being explicitly passed into any Expression (e.g. .url.download()) or Dataframe operation (e.g. daft.read_parquet()).

Source code in daft/context.py
def set_planning_config(
    config: PyDaftPlanningConfig | None = None,
    default_io_config: IOConfig | None = None,
) -> DaftContext:
    """Globally sets various configuration parameters which control Daft plan construction behavior.

    These configuration values are used when a Dataframe is being constructed (e.g. calls to create a Dataframe, or to build on an existing Dataframe).

    Args:
        config: A PyDaftPlanningConfig object to set the config to, before applying other kwargs. Defaults to None which indicates
            that the old (current) config should be used.
        default_io_config: A default IOConfig to use in the absence of one being explicitly passed into any Expression (e.g. `.url.download()`)
            or Dataframe operation (e.g. `daft.read_parquet()`).
    """
    # Replace values in the DaftPlanningConfig with user-specified overrides
    ctx = get_context()
    with ctx._lock:
        old_daft_planning_config = ctx._ctx._daft_planning_config if config is None else config
        new_daft_planning_config = old_daft_planning_config.with_config_values(
            default_io_config=default_io_config,
        )

        ctx._ctx._daft_planning_config = new_daft_planning_config
        return ctx
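
For instance, a sketch that registers a default IOConfig so reads pick up S3 settings without passing io_config each time (the bucket, region, and path are placeholders):

import daft
from daft.context import set_planning_config
from daft.io import IOConfig, S3Config

# Every subsequent read falls back to this IOConfig unless one is passed explicitly
set_planning_config(
    default_io_config=IOConfig(s3=S3Config(region_name="us-west-2", anonymous=True))
)

df = daft.read_parquet("s3://example-bucket/path/*.parquet")  # placeholder path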

planning_config_ctx #

planning_config_ctx(**kwargs)

Context manager that wraps set_planning_config to reset the config to its original setting afterwards.

Source code in daft/context.py
@contextlib.contextmanager
def planning_config_ctx(**kwargs):
    """Context manager that wraps set_planning_config to reset the config to its original setting afternwards."""
    original_config = get_context().daft_planning_config
    try:
        set_planning_config(**kwargs)
        yield
    finally:
        set_planning_config(config=original_config)
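
A sketch of scoping the same override to a single block, so the planning config is restored on exit (path is a placeholder):

import daft
from daft.context import planning_config_ctx
from daft.io import IOConfig, S3Config

# The default IOConfig applies only inside the with-block; the prior config is restored afterwards
with planning_config_ctx(default_io_config=IOConfig(s3=S3Config(anonymous=True))):
    df = daft.read_parquet("s3://example-public-bucket/*.parquet")  # placeholder path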

set_execution_config #

set_execution_config(
    config: PyDaftExecutionConfig | None = None,
    scan_tasks_min_size_bytes: int | None = None,
    scan_tasks_max_size_bytes: int | None = None,
    max_sources_per_scan_task: int | None = None,
    broadcast_join_size_bytes_threshold: int | None = None,
    parquet_split_row_groups_max_files: int | None = None,
    sort_merge_join_sort_with_aligned_boundaries: bool
    | None = None,
    hash_join_partition_size_leniency: float | None = None,
    sample_size_for_sort: int | None = None,
    num_preview_rows: int | None = None,
    parquet_target_filesize: int | None = None,
    parquet_target_row_group_size: int | None = None,
    parquet_inflation_factor: float | None = None,
    csv_target_filesize: int | None = None,
    csv_inflation_factor: float | None = None,
    shuffle_aggregation_default_partitions: int
    | None = None,
    partial_aggregation_threshold: int | None = None,
    high_cardinality_aggregation_threshold: float
    | None = None,
    read_sql_partition_size_bytes: int | None = None,
    enable_aqe: bool | None = None,
    enable_native_executor: bool | None = None,
    default_morsel_size: int | None = None,
    shuffle_algorithm: str | None = None,
    pre_shuffle_merge_threshold: int | None = None,
    flight_shuffle_dirs: list[str] | None = None,
    enable_ray_tracing: bool | None = None,
    scantask_splitting_level: int | None = None,
    flotilla: bool | None = None,
) -> DaftContext

Globally sets various configuration parameters which control various aspects of Daft execution.

These configuration values are used when a Dataframe is executed (e.g. calls to DataFrame.write_*, DataFrame.collect() or DataFrame.show()).

Parameters:

config (PyDaftExecutionConfig | None, default: None): A PyDaftExecutionConfig object to set the config to, before applying other kwargs. Defaults to None, which indicates that the old (current) config should be used.
scan_tasks_min_size_bytes (int | None, default: None): Minimum size in bytes when merging ScanTasks when reading files from storage. Increasing this value will make Daft perform more merging of files into a single partition before yielding, which leads to bigger but fewer partitions. (Defaults to 96 MiB)
scan_tasks_max_size_bytes (int | None, default: None): Maximum size in bytes when merging ScanTasks when reading files from storage. Increasing this value will increase the upper bound of the size of merged ScanTasks, which leads to bigger but fewer partitions. (Defaults to 384 MiB)
max_sources_per_scan_task (int | None, default: None): Maximum number of sources in a single ScanTask. (Defaults to 10)
broadcast_join_size_bytes_threshold (int | None, default: None): If one side of a join is smaller than this threshold, a broadcast join will be used. (Defaults to 10 MiB)
parquet_split_row_groups_max_files (int | None, default: None): Maximum number of files to read in which the row group splitting should happen. (Defaults to 10)
sort_merge_join_sort_with_aligned_boundaries (bool | None, default: None): Whether to use a specialized algorithm for sorting both sides of a sort-merge join such that they have aligned boundaries. This can lead to a faster merge-join at the cost of more skewed sorted join inputs, increasing the risk of OOMs.
hash_join_partition_size_leniency (float | None, default: None): If the left side of a hash join is already correctly partitioned and the right side isn't, and the ratio between the left and right size is at least this value, then the right side is repartitioned to have an equal number of partitions as the left. (Defaults to 0.5)
sample_size_for_sort (int | None, default: None): Number of elements to sample from each partition when running a sort. (Defaults to 20)
num_preview_rows (int | None, default: None): Number of rows to show when displaying a dataframe preview. (Defaults to 8)
parquet_target_filesize (int | None, default: None): Target file size when writing out Parquet files. (Defaults to 512 MB)
parquet_target_row_group_size (int | None, default: None): Target row group size when writing out Parquet files. (Defaults to 128 MB)
parquet_inflation_factor (float | None, default: None): Inflation factor (in-memory size / file size) of Parquet files. (Defaults to 3.0)
csv_target_filesize (int | None, default: None): Target file size when writing out CSV files. (Defaults to 512 MB)
csv_inflation_factor (float | None, default: None): Inflation factor (in-memory size / file size) of CSV files. (Defaults to 0.5)
shuffle_aggregation_default_partitions (int | None, default: None): Maximum number of partitions to create when performing aggregations on the Ray Runner. Defaults to 200, unless the number of input partitions is less than 200.
partial_aggregation_threshold (int | None, default: None): Threshold for performing partial aggregations on the Native Runner. (Defaults to 10000 rows)
high_cardinality_aggregation_threshold (float | None, default: None): Threshold selectivity for performing high-cardinality aggregations on the Native Runner. (Defaults to 0.8)
read_sql_partition_size_bytes (int | None, default: None): Target size of a partition when reading from SQL databases. (Defaults to 512 MB)
enable_aqe (bool | None, default: None): Enables Adaptive Query Execution. (Defaults to False)
enable_native_executor (bool | None, default: None): Enables the native executor. (Defaults to False)
default_morsel_size (int | None, default: None): Default size of morsels used for the new local executor. (Defaults to 131072 rows)
shuffle_algorithm (str | None, default: None): The shuffle algorithm to use. Defaults to "auto", which lets Daft determine the algorithm. Options are "map_reduce" and "pre_shuffle_merge".
pre_shuffle_merge_threshold (int | None, default: None): Memory threshold in bytes for pre-shuffle merge. (Defaults to 1 GB)
flight_shuffle_dirs (list[str] | None, default: None): The directories to use for flight shuffle. (Defaults to ["/tmp"])
enable_ray_tracing (bool | None, default: None): Enable tracing for Ray. Accessible in /tmp/ray/session_latest/logs/daft after the run completes. (Defaults to False)
scantask_splitting_level (int | None, default: None): How aggressively to split scan tasks. Setting this to 2 uses a more aggressive ScanTask splitting algorithm that may be more expensive to run but results in more even splits of partitions. (Defaults to 1)

Source code in daft/context.py
def set_execution_config(
    config: PyDaftExecutionConfig | None = None,
    scan_tasks_min_size_bytes: int | None = None,
    scan_tasks_max_size_bytes: int | None = None,
    max_sources_per_scan_task: int | None = None,
    broadcast_join_size_bytes_threshold: int | None = None,
    parquet_split_row_groups_max_files: int | None = None,
    sort_merge_join_sort_with_aligned_boundaries: bool | None = None,
    hash_join_partition_size_leniency: float | None = None,
    sample_size_for_sort: int | None = None,
    num_preview_rows: int | None = None,
    parquet_target_filesize: int | None = None,
    parquet_target_row_group_size: int | None = None,
    parquet_inflation_factor: float | None = None,
    csv_target_filesize: int | None = None,
    csv_inflation_factor: float | None = None,
    shuffle_aggregation_default_partitions: int | None = None,
    partial_aggregation_threshold: int | None = None,
    high_cardinality_aggregation_threshold: float | None = None,
    read_sql_partition_size_bytes: int | None = None,
    enable_aqe: bool | None = None,
    enable_native_executor: bool | None = None,
    default_morsel_size: int | None = None,
    shuffle_algorithm: str | None = None,
    pre_shuffle_merge_threshold: int | None = None,
    flight_shuffle_dirs: list[str] | None = None,
    enable_ray_tracing: bool | None = None,
    scantask_splitting_level: int | None = None,
    flotilla: bool | None = None,
) -> DaftContext:
    """Globally sets various configuration parameters which control various aspects of Daft execution.

    These configuration values
    are used when a Dataframe is executed (e.g. calls to `DataFrame.write_*`, [DataFrame.collect()](https://www.getdaft.io/projects/docs/en/stable/api/dataframe/#daft.DataFrame.collect) or [DataFrame.show()](https://www.getdaft.io/projects/docs/en/stable/api/dataframe/#daft.DataFrame.select)).

    Args:
        config: A PyDaftExecutionConfig object to set the config to, before applying other kwargs. Defaults to None which indicates
            that the old (current) config should be used.
        scan_tasks_min_size_bytes: Minimum size in bytes when merging ScanTasks when reading files from storage.
            Increasing this value will make Daft perform more merging of files into a single partition before yielding,
            which leads to bigger but fewer partitions. (Defaults to 96 MiB)
        scan_tasks_max_size_bytes: Maximum size in bytes when merging ScanTasks when reading files from storage.
            Increasing this value will increase the upper bound of the size of merged ScanTasks, which leads to bigger but
            fewer partitions. (Defaults to 384 MiB)
        max_sources_per_scan_task: Maximum number of sources in a single ScanTask. (Defaults to 10)
        broadcast_join_size_bytes_threshold: If one side of a join is smaller than this threshold, a broadcast join will be used.
            Default is 10 MiB.
        parquet_split_row_groups_max_files: Maximum number of files to read in which the row group splitting should happen. (Defaults to 10)
        sort_merge_join_sort_with_aligned_boundaries: Whether to use a specialized algorithm for sorting both sides of a
            sort-merge join such that they have aligned boundaries. This can lead to a faster merge-join at the cost of
            more skewed sorted join inputs, increasing the risk of OOMs.
        hash_join_partition_size_leniency: If the left side of a hash join is already correctly partitioned and the right side isn't,
            and the ratio between the left and right size is at least this value, then the right side is repartitioned to have an equal
            number of partitions as the left. Defaults to 0.5.
        sample_size_for_sort: number of elements to sample from each partition when running sort,
            Default is 20.
        num_preview_rows: number of rows to when showing a dataframe preview,
            Default is 8.
        parquet_target_filesize: Target File Size when writing out Parquet Files. Defaults to 512MB
        parquet_target_row_group_size: Target Row Group Size when writing out Parquet Files. Defaults to 128MB
        parquet_inflation_factor: Inflation Factor of parquet files (In-Memory-Size / File-Size) ratio. Defaults to 3.0
        csv_target_filesize: Target File Size when writing out CSV Files. Defaults to 512MB
        csv_inflation_factor: Inflation Factor of CSV files (In-Memory-Size / File-Size) ratio. Defaults to 0.5
        shuffle_aggregation_default_partitions: Maximum number of partitions to create when performing aggregations on the Ray Runner. Defaults to 200, unless the number of input partitions is less than 200.
        partial_aggregation_threshold: Threshold for performing partial aggregations on the Native Runner. Defaults to 10000 rows.
        high_cardinality_aggregation_threshold: Threshold selectivity for performing high cardinality aggregations on the Native Runner. Defaults to 0.8.
        read_sql_partition_size_bytes: Target size of partition when reading from SQL databases. Defaults to 512MB
        enable_aqe: Enables Adaptive Query Execution, Defaults to False
        enable_native_executor: Enables the native executor, Defaults to False
        default_morsel_size: Default size of morsels used for the new local executor. Defaults to 131072 rows.
        shuffle_algorithm: The shuffle algorithm to use. Defaults to "auto", which will let Daft determine the algorithm. Options are "map_reduce" and "pre_shuffle_merge".
        pre_shuffle_merge_threshold: Memory threshold in bytes for pre-shuffle merge. Defaults to 1GB
        flight_shuffle_dirs: The directories to use for flight shuffle. Defaults to ["/tmp"].
        enable_ray_tracing: Enable tracing for Ray. Accessible in `/tmp/ray/session_latest/logs/daft` after the run completes. Defaults to False.
        scantask_splitting_level: How aggressively to split scan tasks. Setting this to `2` will use a more aggressive ScanTask splitting algorithm which might be more expensive to run but results in more even splits of partitions. Defaults to 1.
    """
    # Replace values in the DaftExecutionConfig with user-specified overrides
    ctx = get_context()
    with ctx._lock:
        old_daft_execution_config = ctx._ctx._daft_execution_config if config is None else config

        new_daft_execution_config = old_daft_execution_config.with_config_values(
            scan_tasks_min_size_bytes=scan_tasks_min_size_bytes,
            scan_tasks_max_size_bytes=scan_tasks_max_size_bytes,
            max_sources_per_scan_task=max_sources_per_scan_task,
            broadcast_join_size_bytes_threshold=broadcast_join_size_bytes_threshold,
            parquet_split_row_groups_max_files=parquet_split_row_groups_max_files,
            sort_merge_join_sort_with_aligned_boundaries=sort_merge_join_sort_with_aligned_boundaries,
            hash_join_partition_size_leniency=hash_join_partition_size_leniency,
            sample_size_for_sort=sample_size_for_sort,
            num_preview_rows=num_preview_rows,
            parquet_target_filesize=parquet_target_filesize,
            parquet_target_row_group_size=parquet_target_row_group_size,
            parquet_inflation_factor=parquet_inflation_factor,
            csv_target_filesize=csv_target_filesize,
            csv_inflation_factor=csv_inflation_factor,
            shuffle_aggregation_default_partitions=shuffle_aggregation_default_partitions,
            partial_aggregation_threshold=partial_aggregation_threshold,
            high_cardinality_aggregation_threshold=high_cardinality_aggregation_threshold,
            read_sql_partition_size_bytes=read_sql_partition_size_bytes,
            enable_aqe=enable_aqe,
            enable_native_executor=enable_native_executor,
            default_morsel_size=default_morsel_size,
            shuffle_algorithm=shuffle_algorithm,
            flight_shuffle_dirs=flight_shuffle_dirs,
            pre_shuffle_merge_threshold=pre_shuffle_merge_threshold,
            enable_ray_tracing=enable_ray_tracing,
            scantask_splitting_level=scantask_splitting_level,
            flotilla=flotilla,
        )

        ctx._ctx._daft_execution_config = new_daft_execution_config
        return ctx
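
As an example, a sketch that nudges a couple of execution knobs before running a query (the values and path are illustrative, not recommendations):

import daft
from daft.context import set_execution_config

# Raise the minimum scan-task merge size and widen previews; unspecified settings keep their current values
set_execution_config(
    scan_tasks_min_size_bytes=128 * 1024 * 1024,
    num_preview_rows=20,
)

df = daft.read_csv("data/events.csv")  # placeholder path
df.show()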

execution_config_ctx #

execution_config_ctx(**kwargs)

Context manager that wraps set_execution_config to reset the config to its original setting afterwards.

Source code in daft/context.py
@contextlib.contextmanager
def execution_config_ctx(**kwargs):
    """Context manager that wraps set_execution_config to reset the config to its original setting afternwards."""
    original_config = get_context()._ctx._daft_execution_config
    try:
        set_execution_config(**kwargs)
        yield
    finally:
        set_execution_config(config=original_config)
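
And a sketch of applying an execution override only for a single query (path is a placeholder):

import daft
from daft.context import execution_config_ctx

# Adaptive Query Execution is enabled only while this block runs; the previous config is restored afterwards
with execution_config_ctx(enable_aqe=True):
    df = daft.read_parquet("data/large/*.parquet")  # placeholder path
    print(df.count_rows())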

I/O Configurations#

Configure behavior when Daft interacts with storage (e.g. credentials, retry policies, and various other knobs to control performance and resource usage).

These configurations are most often used as inputs to Daft's DataFrame reading functions, such as those listed under Dataframe Creation.

IOConfig #

IOConfig(
    s3: S3Config | None = None,
    azure: AzureConfig | None = None,
    gcs: GCSConfig | None = None,
    http: HTTPConfig | None = None,
)

Configuration for the native I/O layer, e.g. credentials for accessing cloud storage systems.

Methods:

replace: Replaces values if provided, returning a new IOConfig.

Attributes:

azure (AzureConfig)
gcs (GCSConfig)
http (HTTPConfig)
s3 (S3Config)

Source code in daft/daft/__init__.pyi
def __init__(
    self,
    s3: S3Config | None = None,
    azure: AzureConfig | None = None,
    gcs: GCSConfig | None = None,
    http: HTTPConfig | None = None,
): ...

azure #

azure: AzureConfig

gcs #

gcs: GCSConfig

http #

http: HTTPConfig

s3 #

s3: S3Config

replace #

replace(
    s3: S3Config | None = None,
    azure: AzureConfig | None = None,
    gcs: GCSConfig | None = None,
    http: HTTPConfig | None = None,
) -> IOConfig

Replaces values if provided, returning a new IOConfig.

Source code in daft/daft/__init__.pyi
648
649
650
651
652
653
654
655
656
def replace(
    self,
    s3: S3Config | None = None,
    azure: AzureConfig | None = None,
    gcs: GCSConfig | None = None,
    http: HTTPConfig | None = None,
) -> IOConfig:
    """Replaces values if provided, returning a new IOConfig."""
    ...
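
A sketch of building an IOConfig and deriving a variant with replace (all values are placeholders):

from daft.io import GCSConfig, IOConfig, S3Config

base = IOConfig(s3=S3Config(region_name="us-east-1"))

# replace returns a new IOConfig with the given fields swapped in; base is unchanged
with_gcs = base.replace(gcs=GCSConfig(project_id="example-project"))  # placeholder project id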

S3Config #

S3Config(
    region_name: str | None = None,
    endpoint_url: str | None = None,
    key_id: str | None = None,
    session_token: str | None = None,
    access_key: str | None = None,
    credentials_provider: Callable[[], S3Credentials]
    | None = None,
    buffer_time: int | None = None,
    max_connections: int | None = None,
    retry_initial_backoff_ms: int | None = None,
    connect_timeout_ms: int | None = None,
    read_timeout_ms: int | None = None,
    num_tries: int | None = None,
    retry_mode: str | None = None,
    anonymous: bool | None = None,
    use_ssl: bool | None = None,
    verify_ssl: bool | None = None,
    check_hostname_ssl: bool | None = None,
    requester_pays: bool | None = None,
    force_virtual_addressing: bool | None = None,
    profile_name: str | None = None,
)

I/O configuration for accessing an S3-compatible system.

Methods:

from_env: Creates an S3Config, retrieving credentials and configurations from the current environment.
provide_cached_credentials: Wrapper around call to S3Config.credentials_provider to cache credentials until expiry.
replace: Replaces values if provided, returning a new S3Config.

Attributes:

access_key (str | None)
anonymous (bool)
check_hostname_ssl (bool)
connect_timeout_ms (int)
credentials_provider (Callable[[], S3Credentials] | None)
endpoint_url (str | None)
force_virtual_addressing (bool | None)
key_id (str | None)
max_connections (int)
num_tries (int)
profile_name (str | None)
read_timeout_ms (int)
region_name (str | None)
requester_pays (bool | None)
retry_initial_backoff_ms (int)
retry_mode (str | None)
session_token (str | None)
use_ssl (bool)
verify_ssl (bool)

Source code in daft/daft/__init__.pyi
def __init__(
    self,
    region_name: str | None = None,
    endpoint_url: str | None = None,
    key_id: str | None = None,
    session_token: str | None = None,
    access_key: str | None = None,
    credentials_provider: Callable[[], S3Credentials] | None = None,
    buffer_time: int | None = None,
    max_connections: int | None = None,
    retry_initial_backoff_ms: int | None = None,
    connect_timeout_ms: int | None = None,
    read_timeout_ms: int | None = None,
    num_tries: int | None = None,
    retry_mode: str | None = None,
    anonymous: bool | None = None,
    use_ssl: bool | None = None,
    verify_ssl: bool | None = None,
    check_hostname_ssl: bool | None = None,
    requester_pays: bool | None = None,
    force_virtual_addressing: bool | None = None,
    profile_name: str | None = None,
): ...

access_key #

access_key: str | None

anonymous #

anonymous: bool

check_hostname_ssl #

check_hostname_ssl: bool

connect_timeout_ms #

connect_timeout_ms: int

credentials_provider #

credentials_provider: Callable[[], S3Credentials] | None

endpoint_url #

endpoint_url: str | None

force_virtual_addressing #

force_virtual_addressing: bool | None

key_id #

key_id: str | None

max_connections #

max_connections: int

num_tries #

num_tries: int

profile_name #

profile_name: str | None

read_timeout_ms #

read_timeout_ms: int

region_name #

region_name: str | None

requester_pays #

requester_pays: bool | None

retry_initial_backoff_ms #

retry_initial_backoff_ms: int

retry_mode #

retry_mode: str | None

session_token #

session_token: str | None

use_ssl #

use_ssl: bool

verify_ssl #

verify_ssl: bool

from_env #

from_env() -> S3Config

Creates an S3Config, retrieving credentials and configurations from the current environment.

Source code in daft/daft/__init__.pyi
@staticmethod
def from_env() -> S3Config:
    """Creates an S3Config, retrieving credentials and configurations from the current environment."""
    ...

provide_cached_credentials #

provide_cached_credentials() -> S3Credentials | None

Wrapper around call to S3Config.credentials_provider to cache credentials until expiry.

Source code in daft/daft/__init__.pyi
def provide_cached_credentials(self) -> S3Credentials | None:
    """Wrapper around call to `S3Config.credentials_provider` to cache credentials until expiry."""
    ...

replace #

replace(
    region_name: str | None = None,
    endpoint_url: str | None = None,
    key_id: str | None = None,
    session_token: str | None = None,
    access_key: str | None = None,
    credentials_provider: Callable[[], S3Credentials]
    | None = None,
    max_connections: int | None = None,
    retry_initial_backoff_ms: int | None = None,
    connect_timeout_ms: int | None = None,
    read_timeout_ms: int | None = None,
    num_tries: int | None = None,
    retry_mode: str | None = None,
    anonymous: bool | None = None,
    use_ssl: bool | None = None,
    verify_ssl: bool | None = None,
    check_hostname_ssl: bool | None = None,
    requester_pays: bool | None = None,
    force_virtual_addressing: bool | None = None,
    profile_name: str | None = None,
) -> S3Config

Replaces values if provided, returning a new S3Config.

Source code in daft/daft/__init__.pyi
def replace(
    self,
    region_name: str | None = None,
    endpoint_url: str | None = None,
    key_id: str | None = None,
    session_token: str | None = None,
    access_key: str | None = None,
    credentials_provider: Callable[[], S3Credentials] | None = None,
    max_connections: int | None = None,
    retry_initial_backoff_ms: int | None = None,
    connect_timeout_ms: int | None = None,
    read_timeout_ms: int | None = None,
    num_tries: int | None = None,
    retry_mode: str | None = None,
    anonymous: bool | None = None,
    use_ssl: bool | None = None,
    verify_ssl: bool | None = None,
    check_hostname_ssl: bool | None = None,
    requester_pays: bool | None = None,
    force_virtual_addressing: bool | None = None,
    profile_name: str | None = None,
) -> S3Config:
    """Replaces values if provided, returning a new S3Config."""
    ...
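
For example, a sketch of an S3Config for an S3-compatible endpoint such as MinIO (endpoint and keys are placeholders; prefer sourcing real secrets from the environment):

import daft
from daft.io import IOConfig, S3Config

s3 = S3Config(
    endpoint_url="http://localhost:9000",  # placeholder S3-compatible endpoint
    key_id="example-access-key",           # placeholder credentials
    access_key="example-secret-key",
    use_ssl=False,
)

df = daft.read_parquet("s3://example-bucket/data.parquet", io_config=IOConfig(s3=s3))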

S3Credentials #

S3Credentials(
    key_id: str,
    access_key: str,
    session_token: str | None = None,
    expiry: datetime | None = None,
)

Attributes:

access_key (str)
expiry (datetime | None)
key_id (str)
session_token (str | None)

Source code in daft/daft/__init__.pyi
def __init__(
    self,
    key_id: str,
    access_key: str,
    session_token: str | None = None,
    expiry: datetime.datetime | None = None,
): ...

access_key #

access_key: str

expiry #

expiry: datetime | None

key_id #

key_id: str

session_token #

session_token: str | None
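
A sketch of a credentials_provider callable that returns S3Credentials with an expiry, so cached credentials can be refreshed (the lookup itself is hypothetical; assumes S3Credentials is importable from daft.io alongside S3Config):

from datetime import datetime, timedelta, timezone

from daft.io import S3Config, S3Credentials

def fetch_temporary_credentials() -> S3Credentials:
    # Hypothetical lookup: replace with your own STS/vault call
    return S3Credentials(
        key_id="temporary-key-id",
        access_key="temporary-secret-key",
        expiry=datetime.now(timezone.utc) + timedelta(hours=1),
    )

s3 = S3Config(credentials_provider=fetch_temporary_credentials, region_name="us-west-2")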

GCSConfig #

GCSConfig(
    project_id: str | None = None,
    credentials: str | None = None,
    token: str | None = None,
    anonymous: bool | None = None,
    max_connections: int | None = None,
    retry_initial_backoff_ms: int | None = None,
    connect_timeout_ms: int | None = None,
    read_timeout_ms: int | None = None,
    num_tries: int | None = None,
)

I/O configuration for accessing Google Cloud Storage.

Methods:

replace: Replaces values if provided, returning a new GCSConfig.

Attributes:

anonymous (bool)
connect_timeout_ms (int)
credentials (str | None)
max_connections (int)
num_tries (int)
project_id (str | None)
read_timeout_ms (int)
retry_initial_backoff_ms (int)
token (str | None)

Source code in daft/daft/__init__.pyi
def __init__(
    self,
    project_id: str | None = None,
    credentials: str | None = None,
    token: str | None = None,
    anonymous: bool | None = None,
    max_connections: int | None = None,
    retry_initial_backoff_ms: int | None = None,
    connect_timeout_ms: int | None = None,
    read_timeout_ms: int | None = None,
    num_tries: int | None = None,
): ...

anonymous #

anonymous: bool

connect_timeout_ms #

connect_timeout_ms: int

credentials #

credentials: str | None

max_connections #

max_connections: int

num_tries #

num_tries: int

project_id #

project_id: str | None

read_timeout_ms #

read_timeout_ms: int

retry_initial_backoff_ms #

retry_initial_backoff_ms: int

token #

token: str | None

replace #

replace(
    project_id: str | None = None,
    credentials: str | None = None,
    token: str | None = None,
    anonymous: bool | None = None,
    max_connections: int | None = None,
    retry_initial_backoff_ms: int | None = None,
    connect_timeout_ms: int | None = None,
    read_timeout_ms: int | None = None,
    num_tries: int | None = None,
) -> GCSConfig

Replaces values if provided, returning a new GCSConfig.

Source code in daft/daft/__init__.pyi
def replace(
    self,
    project_id: str | None = None,
    credentials: str | None = None,
    token: str | None = None,
    anonymous: bool | None = None,
    max_connections: int | None = None,
    retry_initial_backoff_ms: int | None = None,
    connect_timeout_ms: int | None = None,
    read_timeout_ms: int | None = None,
    num_tries: int | None = None,
) -> GCSConfig:
    """Replaces values if provided, returning a new GCSConfig."""
    ...
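
A sketch of anonymous access to a public GCS bucket (the path is a placeholder; assumes the gs:// protocol prefix for GCS paths):

import daft
from daft.io import GCSConfig, IOConfig

# Anonymous access to a public bucket; pass credentials or project_id for private data
gcs = GCSConfig(anonymous=True)
df = daft.read_parquet("gs://example-public-bucket/*.parquet", io_config=IOConfig(gcs=gcs))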

AzureConfig #

AzureConfig(
    storage_account: str | None = None,
    access_key: str | None = None,
    sas_token: str | None = None,
    bearer_token: str | None = None,
    tenant_id: str | None = None,
    client_id: str | None = None,
    client_secret: str | None = None,
    use_fabric_endpoint: bool | None = None,
    anonymous: bool | None = None,
    endpoint_url: str | None = None,
    use_ssl: bool | None = None,
)

I/O configuration for accessing Azure Blob Storage.

Methods:

replace: Replaces values if provided, returning a new AzureConfig.

Attributes:

access_key (str | None)
anonymous (bool | None)
bearer_token (str | None)
client_id (str | None)
client_secret (str | None)
endpoint_url (str | None)
sas_token (str | None)
storage_account (str | None)
tenant_id (str | None)
use_fabric_endpoint (bool | None)
use_ssl (bool | None)

Source code in daft/daft/__init__.pyi
def __init__(
    self,
    storage_account: str | None = None,
    access_key: str | None = None,
    sas_token: str | None = None,
    bearer_token: str | None = None,
    tenant_id: str | None = None,
    client_id: str | None = None,
    client_secret: str | None = None,
    use_fabric_endpoint: bool | None = None,
    anonymous: bool | None = None,
    endpoint_url: str | None = None,
    use_ssl: bool | None = None,
): ...

access_key #

access_key: str | None

anonymous #

anonymous: bool | None

bearer_token #

bearer_token: str | None

client_id #

client_id: str | None

client_secret #

client_secret: str | None

endpoint_url #

endpoint_url: str | None = None

sas_token #

sas_token: str | None

storage_account #

storage_account: str | None

tenant_id #

tenant_id: str | None

use_fabric_endpoint #

use_fabric_endpoint: bool | None

use_ssl #

use_ssl: bool | None = None

replace #

replace(
    storage_account: str | None = None,
    access_key: str | None = None,
    sas_token: str | None = None,
    bearer_token: str | None = None,
    tenant_id: str | None = None,
    client_id: str | None = None,
    client_secret: str | None = None,
    use_fabric_endpoint: bool | None = None,
    anonymous: bool | None = None,
    endpoint_url: str | None = None,
    use_ssl: bool | None = None,
) -> AzureConfig

Replaces values if provided, returning a new AzureConfig.

Source code in daft/daft/__init__.pyi
def replace(
    self,
    storage_account: str | None = None,
    access_key: str | None = None,
    sas_token: str | None = None,
    bearer_token: str | None = None,
    tenant_id: str | None = None,
    client_id: str | None = None,
    client_secret: str | None = None,
    use_fabric_endpoint: bool | None = None,
    anonymous: bool | None = None,
    endpoint_url: str | None = None,
    use_ssl: bool | None = None,
) -> AzureConfig:
    """Replaces values if provided, returning a new AzureConfig."""
    ...
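
Finally, a sketch of an AzureConfig using a storage account plus SAS token (both values are placeholders; assumes the az:// protocol prefix for Azure Blob paths):

import daft
from daft.io import AzureConfig, IOConfig

azure = AzureConfig(
    storage_account="examplestorageacct",  # placeholder storage account
    sas_token="<sas-token>",               # placeholder token; prefer a secret store
)

df = daft.read_parquet("az://example-container/data.parquet", io_config=IOConfig(azure=azure))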