reconnect moved files to git repo
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
516
venv/lib/python3.11/site-packages/narwhals/_duckdb/dataframe.py
Normal file
516
venv/lib/python3.11/site-packages/narwhals/_duckdb/dataframe.py
Normal file
@ -0,0 +1,516 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import reduce
|
||||
from operator import and_
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import duckdb
|
||||
from duckdb import StarExpression
|
||||
|
||||
from narwhals._duckdb.utils import (
|
||||
DeferredTimeZone,
|
||||
F,
|
||||
catch_duckdb_exception,
|
||||
col,
|
||||
evaluate_exprs,
|
||||
lit,
|
||||
native_to_narwhals_dtype,
|
||||
window_expression,
|
||||
)
|
||||
from narwhals._sql.dataframe import SQLLazyFrame
|
||||
from narwhals._utils import (
|
||||
Implementation,
|
||||
ValidateBackendVersion,
|
||||
Version,
|
||||
generate_temporary_column_name,
|
||||
not_implemented,
|
||||
parse_columns_to_drop,
|
||||
requires,
|
||||
)
|
||||
from narwhals.dependencies import get_duckdb
|
||||
from narwhals.exceptions import InvalidOperationError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator, Mapping, Sequence
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from types import ModuleType
|
||||
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
from duckdb import Expression
|
||||
from duckdb.typing import DuckDBPyType
|
||||
from typing_extensions import Self, TypeIs
|
||||
|
||||
from narwhals._compliant.typing import CompliantDataFrameAny
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
from narwhals._duckdb.group_by import DuckDBGroupBy
|
||||
from narwhals._duckdb.namespace import DuckDBNamespace
|
||||
from narwhals._duckdb.series import DuckDBInterchangeSeries
|
||||
from narwhals._utils import _LimitedContext
|
||||
from narwhals.dataframe import LazyFrame
|
||||
from narwhals.dtypes import DType
|
||||
from narwhals.stable.v1 import DataFrame as DataFrameV1
|
||||
from narwhals.typing import AsofJoinStrategy, JoinStrategy, LazyUniqueKeepStrategy
|
||||
|
||||
|
||||
class DuckDBLazyFrame(
|
||||
SQLLazyFrame[
|
||||
"DuckDBExpr",
|
||||
"duckdb.DuckDBPyRelation",
|
||||
"LazyFrame[duckdb.DuckDBPyRelation] | DataFrameV1[duckdb.DuckDBPyRelation]",
|
||||
],
|
||||
ValidateBackendVersion,
|
||||
):
|
||||
_implementation = Implementation.DUCKDB
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
df: duckdb.DuckDBPyRelation,
|
||||
*,
|
||||
version: Version,
|
||||
validate_backend_version: bool = False,
|
||||
) -> None:
|
||||
self._native_frame: duckdb.DuckDBPyRelation = df
|
||||
self._version = version
|
||||
self._cached_native_schema: dict[str, DuckDBPyType] | None = None
|
||||
self._cached_columns: list[str] | None = None
|
||||
if validate_backend_version:
|
||||
self._validate_backend_version()
|
||||
|
||||
@property
|
||||
def _backend_version(self) -> tuple[int, ...]:
|
||||
return self._implementation._backend_version()
|
||||
|
||||
@staticmethod
|
||||
def _is_native(obj: duckdb.DuckDBPyRelation | Any) -> TypeIs[duckdb.DuckDBPyRelation]:
|
||||
return isinstance(obj, duckdb.DuckDBPyRelation)
|
||||
|
||||
@classmethod
|
||||
def from_native(
|
||||
cls, data: duckdb.DuckDBPyRelation, /, *, context: _LimitedContext
|
||||
) -> Self:
|
||||
return cls(data, version=context._version)
|
||||
|
||||
def to_narwhals(
|
||||
self, *args: Any, **kwds: Any
|
||||
) -> LazyFrame[duckdb.DuckDBPyRelation] | DataFrameV1[duckdb.DuckDBPyRelation]:
|
||||
if self._version is Version.V1:
|
||||
from narwhals.stable.v1 import DataFrame as DataFrameV1
|
||||
|
||||
return DataFrameV1(self, level="interchange") # type: ignore[no-any-return]
|
||||
return self._version.lazyframe(self, level="lazy")
|
||||
|
||||
def __narwhals_dataframe__(self) -> Self: # pragma: no cover
|
||||
# Keep around for backcompat.
|
||||
if self._version is not Version.V1:
|
||||
msg = "__narwhals_dataframe__ is not implemented for DuckDBLazyFrame"
|
||||
raise AttributeError(msg)
|
||||
return self
|
||||
|
||||
def __narwhals_lazyframe__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __native_namespace__(self) -> ModuleType:
|
||||
return get_duckdb() # type: ignore[no-any-return]
|
||||
|
||||
def __narwhals_namespace__(self) -> DuckDBNamespace:
|
||||
from narwhals._duckdb.namespace import DuckDBNamespace
|
||||
|
||||
return DuckDBNamespace(version=self._version)
|
||||
|
||||
def get_column(self, name: str) -> DuckDBInterchangeSeries:
|
||||
from narwhals._duckdb.series import DuckDBInterchangeSeries
|
||||
|
||||
return DuckDBInterchangeSeries(self.native.select(name), version=self._version)
|
||||
|
||||
def _iter_columns(self) -> Iterator[Expression]:
|
||||
for name in self.columns:
|
||||
yield col(name)
|
||||
|
||||
def collect(
|
||||
self, backend: ModuleType | Implementation | str | None, **kwargs: Any
|
||||
) -> CompliantDataFrameAny:
|
||||
if backend is None or backend is Implementation.PYARROW:
|
||||
from narwhals._arrow.dataframe import ArrowDataFrame
|
||||
|
||||
return ArrowDataFrame(
|
||||
self.native.arrow(),
|
||||
validate_backend_version=True,
|
||||
version=self._version,
|
||||
validate_column_names=True,
|
||||
)
|
||||
|
||||
if backend is Implementation.PANDAS:
|
||||
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
|
||||
|
||||
return PandasLikeDataFrame(
|
||||
self.native.df(),
|
||||
implementation=Implementation.PANDAS,
|
||||
validate_backend_version=True,
|
||||
version=self._version,
|
||||
validate_column_names=True,
|
||||
)
|
||||
|
||||
if backend is Implementation.POLARS:
|
||||
from narwhals._polars.dataframe import PolarsDataFrame
|
||||
|
||||
return PolarsDataFrame(
|
||||
self.native.pl(), validate_backend_version=True, version=self._version
|
||||
)
|
||||
|
||||
msg = f"Unsupported `backend` value: {backend}" # pragma: no cover
|
||||
raise ValueError(msg) # pragma: no cover
|
||||
|
||||
def head(self, n: int) -> Self:
|
||||
return self._with_native(self.native.limit(n))
|
||||
|
||||
def simple_select(self, *column_names: str) -> Self:
|
||||
return self._with_native(self.native.select(*column_names))
|
||||
|
||||
def aggregate(self, *exprs: DuckDBExpr) -> Self:
|
||||
selection = [val.alias(name) for name, val in evaluate_exprs(self, *exprs)]
|
||||
try:
|
||||
return self._with_native(self.native.aggregate(selection)) # type: ignore[arg-type]
|
||||
except Exception as e: # noqa: BLE001
|
||||
raise catch_duckdb_exception(e, self) from None
|
||||
|
||||
def select(self, *exprs: DuckDBExpr) -> Self:
|
||||
selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs))
|
||||
try:
|
||||
return self._with_native(self.native.select(*selection))
|
||||
except Exception as e: # noqa: BLE001
|
||||
raise catch_duckdb_exception(e, self) from None
|
||||
|
||||
def drop(self, columns: Sequence[str], *, strict: bool) -> Self:
|
||||
columns_to_drop = parse_columns_to_drop(self, columns, strict=strict)
|
||||
selection = (name for name in self.columns if name not in columns_to_drop)
|
||||
return self._with_native(self.native.select(*selection))
|
||||
|
||||
def lazy(self, *, backend: Implementation | None = None) -> Self:
|
||||
# The `backend`` argument has no effect but we keep it here for
|
||||
# backwards compatibility because in `narwhals.stable.v1`
|
||||
# function `.from_native()` will return a DataFrame for DuckDB.
|
||||
|
||||
if backend is not None: # pragma: no cover
|
||||
msg = "`backend` argument is not supported for DuckDB"
|
||||
raise ValueError(msg)
|
||||
return self
|
||||
|
||||
def with_columns(self, *exprs: DuckDBExpr) -> Self:
|
||||
new_columns_map = dict(evaluate_exprs(self, *exprs))
|
||||
result = [
|
||||
new_columns_map.pop(name).alias(name)
|
||||
if name in new_columns_map
|
||||
else col(name)
|
||||
for name in self.columns
|
||||
]
|
||||
result.extend(value.alias(name) for name, value in new_columns_map.items())
|
||||
try:
|
||||
return self._with_native(self.native.select(*result))
|
||||
except Exception as e: # noqa: BLE001
|
||||
raise catch_duckdb_exception(e, self) from None
|
||||
|
||||
def filter(self, predicate: DuckDBExpr) -> Self:
|
||||
# `[0]` is safe as the predicate's expression only returns a single column
|
||||
mask = predicate(self)[0]
|
||||
try:
|
||||
return self._with_native(self.native.filter(mask))
|
||||
except Exception as e: # noqa: BLE001
|
||||
raise catch_duckdb_exception(e, self) from None
|
||||
|
||||
@property
|
||||
def schema(self) -> dict[str, DType]:
|
||||
if self._cached_native_schema is None:
|
||||
# Note: prefer `self._cached_native_schema` over `functools.cached_property`
|
||||
# due to Python3.13 failures.
|
||||
self._cached_native_schema = dict(zip(self.columns, self.native.types))
|
||||
|
||||
deferred_time_zone = DeferredTimeZone(self.native)
|
||||
return {
|
||||
column_name: native_to_narwhals_dtype(
|
||||
duckdb_dtype, self._version, deferred_time_zone
|
||||
)
|
||||
for column_name, duckdb_dtype in zip(self.native.columns, self.native.types)
|
||||
}
|
||||
|
||||
@property
|
||||
def columns(self) -> list[str]:
|
||||
if self._cached_columns is None:
|
||||
self._cached_columns = (
|
||||
list(self.schema)
|
||||
if self._cached_native_schema is not None
|
||||
else self.native.columns
|
||||
)
|
||||
return self._cached_columns
|
||||
|
||||
def to_pandas(self) -> pd.DataFrame:
|
||||
# only if version is v1, keep around for backcompat
|
||||
return self.native.df()
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
# only if version is v1, keep around for backcompat
|
||||
return self.native.arrow()
|
||||
|
||||
def _with_version(self, version: Version) -> Self:
|
||||
return self.__class__(self.native, version=version)
|
||||
|
||||
def _with_native(self, df: duckdb.DuckDBPyRelation) -> Self:
|
||||
return self.__class__(df, version=self._version)
|
||||
|
||||
def group_by(
|
||||
self, keys: Sequence[str] | Sequence[DuckDBExpr], *, drop_null_keys: bool
|
||||
) -> DuckDBGroupBy:
|
||||
from narwhals._duckdb.group_by import DuckDBGroupBy
|
||||
|
||||
return DuckDBGroupBy(self, keys, drop_null_keys=drop_null_keys)
|
||||
|
||||
def rename(self, mapping: Mapping[str, str]) -> Self:
|
||||
df = self.native
|
||||
selection = (
|
||||
col(name).alias(mapping[name]) if name in mapping else col(name)
|
||||
for name in df.columns
|
||||
)
|
||||
return self._with_native(self.native.select(*selection))
|
||||
|
||||
def join(
|
||||
self,
|
||||
other: Self,
|
||||
*,
|
||||
how: JoinStrategy,
|
||||
left_on: Sequence[str] | None,
|
||||
right_on: Sequence[str] | None,
|
||||
suffix: str,
|
||||
) -> Self:
|
||||
native_how = "outer" if how == "full" else how
|
||||
|
||||
if native_how == "cross":
|
||||
if self._backend_version < (1, 1, 4):
|
||||
msg = f"'duckdb>=1.1.4' is required for cross-join, found version: {self._backend_version}"
|
||||
raise NotImplementedError(msg)
|
||||
rel = self.native.set_alias("lhs").cross(other.native.set_alias("rhs"))
|
||||
else:
|
||||
# help mypy
|
||||
assert left_on is not None # noqa: S101
|
||||
assert right_on is not None # noqa: S101
|
||||
it = (
|
||||
col(f'lhs."{left}"') == col(f'rhs."{right}"')
|
||||
for left, right in zip(left_on, right_on)
|
||||
)
|
||||
condition: Expression = reduce(and_, it)
|
||||
rel = self.native.set_alias("lhs").join(
|
||||
other.native.set_alias("rhs"),
|
||||
# NOTE: Fixed in `--pre` https://github.com/duckdb/duckdb/pull/16933
|
||||
condition=condition, # type: ignore[arg-type, unused-ignore]
|
||||
how=native_how,
|
||||
)
|
||||
|
||||
if native_how in {"inner", "left", "cross", "outer"}:
|
||||
select = [col(f'lhs."{x}"') for x in self.columns]
|
||||
for name in other.columns:
|
||||
col_in_lhs: bool = name in self.columns
|
||||
if native_how == "outer" and not col_in_lhs:
|
||||
select.append(col(f'rhs."{name}"'))
|
||||
elif (native_how == "outer") or (
|
||||
col_in_lhs and (right_on is None or name not in right_on)
|
||||
):
|
||||
select.append(col(f'rhs."{name}"').alias(f"{name}{suffix}"))
|
||||
elif right_on is None or name not in right_on:
|
||||
select.append(col(name))
|
||||
res = rel.select(*select).set_alias(self.native.alias)
|
||||
else: # semi, anti
|
||||
res = rel.select("lhs.*").set_alias(self.native.alias)
|
||||
|
||||
return self._with_native(res)
|
||||
|
||||
def join_asof(
|
||||
self,
|
||||
other: Self,
|
||||
*,
|
||||
left_on: str,
|
||||
right_on: str,
|
||||
by_left: Sequence[str] | None,
|
||||
by_right: Sequence[str] | None,
|
||||
strategy: AsofJoinStrategy,
|
||||
suffix: str,
|
||||
) -> Self:
|
||||
lhs = self.native
|
||||
rhs = other.native
|
||||
conditions: list[Expression] = []
|
||||
if by_left is not None and by_right is not None:
|
||||
conditions.extend(
|
||||
col(f'lhs."{left}"') == col(f'rhs."{right}"')
|
||||
for left, right in zip(by_left, by_right)
|
||||
)
|
||||
else:
|
||||
by_left = by_right = []
|
||||
if strategy == "backward":
|
||||
conditions.append(col(f'lhs."{left_on}"') >= col(f'rhs."{right_on}"'))
|
||||
elif strategy == "forward":
|
||||
conditions.append(col(f'lhs."{left_on}"') <= col(f'rhs."{right_on}"'))
|
||||
else:
|
||||
msg = "Only 'backward' and 'forward' strategies are currently supported for DuckDB"
|
||||
raise NotImplementedError(msg)
|
||||
condition: Expression = reduce(and_, conditions)
|
||||
select = ["lhs.*"]
|
||||
for name in rhs.columns:
|
||||
if name in lhs.columns and (
|
||||
right_on is None or name not in {right_on, *by_right}
|
||||
):
|
||||
select.append(f'rhs."{name}" as "{name}{suffix}"')
|
||||
elif right_on is None or name not in {right_on, *by_right}:
|
||||
select.append(str(col(name)))
|
||||
# Replace with Python API call once
|
||||
# https://github.com/duckdb/duckdb/discussions/16947 is addressed.
|
||||
query = f"""
|
||||
SELECT {",".join(select)}
|
||||
FROM lhs
|
||||
ASOF LEFT JOIN rhs
|
||||
ON {condition}
|
||||
""" # noqa: S608
|
||||
return self._with_native(duckdb.sql(query))
|
||||
|
||||
def collect_schema(self) -> dict[str, DType]:
|
||||
return self.schema
|
||||
|
||||
def unique(
|
||||
self, subset: Sequence[str] | None, *, keep: LazyUniqueKeepStrategy
|
||||
) -> Self:
|
||||
if subset_ := subset if keep == "any" else (subset or self.columns):
|
||||
# Sanitise input
|
||||
if error := self._check_columns_exist(subset_):
|
||||
raise error
|
||||
idx_name = generate_temporary_column_name(8, self.columns)
|
||||
count_name = generate_temporary_column_name(8, [*self.columns, idx_name])
|
||||
name = count_name if keep == "none" else idx_name
|
||||
idx_expr = window_expression(F("row_number"), subset_).alias(idx_name)
|
||||
count_expr = window_expression(
|
||||
F("count", StarExpression()), subset_, ()
|
||||
).alias(count_name)
|
||||
return self._with_native(
|
||||
self.native.select(StarExpression(), idx_expr, count_expr)
|
||||
.filter(col(name) == lit(1))
|
||||
.select(StarExpression(exclude=[count_name, idx_name]))
|
||||
)
|
||||
return self._with_native(self.native.unique(", ".join(self.columns)))
|
||||
|
||||
def sort(self, *by: str, descending: bool | Sequence[bool], nulls_last: bool) -> Self:
|
||||
if isinstance(descending, bool):
|
||||
descending = [descending] * len(by)
|
||||
if nulls_last:
|
||||
it = (
|
||||
col(name).nulls_last() if not desc else col(name).desc().nulls_last()
|
||||
for name, desc in zip(by, descending)
|
||||
)
|
||||
else:
|
||||
it = (
|
||||
col(name).nulls_first() if not desc else col(name).desc().nulls_first()
|
||||
for name, desc in zip(by, descending)
|
||||
)
|
||||
return self._with_native(self.native.sort(*it))
|
||||
|
||||
def drop_nulls(self, subset: Sequence[str] | None) -> Self:
|
||||
subset_ = subset if subset is not None else self.columns
|
||||
keep_condition = reduce(and_, (col(name).isnotnull() for name in subset_))
|
||||
return self._with_native(self.native.filter(keep_condition))
|
||||
|
||||
def explode(self, columns: Sequence[str]) -> Self:
|
||||
dtypes = self._version.dtypes
|
||||
schema = self.collect_schema()
|
||||
for name in columns:
|
||||
dtype = schema[name]
|
||||
if dtype != dtypes.List:
|
||||
msg = (
|
||||
f"`explode` operation not supported for dtype `{dtype}`, "
|
||||
"expected List type"
|
||||
)
|
||||
raise InvalidOperationError(msg)
|
||||
|
||||
if len(columns) != 1:
|
||||
msg = (
|
||||
"Exploding on multiple columns is not supported with DuckDB backend since "
|
||||
"we cannot guarantee that the exploded columns have matching element counts."
|
||||
)
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
col_to_explode = col(columns[0])
|
||||
rel = self.native
|
||||
original_columns = self.columns
|
||||
|
||||
not_null_condition = col_to_explode.isnotnull() & F("len", col_to_explode) > lit(
|
||||
0
|
||||
)
|
||||
non_null_rel = rel.filter(not_null_condition).select(
|
||||
*(
|
||||
F("unnest", col_to_explode).alias(name) if name in columns else name
|
||||
for name in original_columns
|
||||
)
|
||||
)
|
||||
|
||||
null_rel = rel.filter(~not_null_condition).select(
|
||||
*(
|
||||
lit(None).alias(name) if name in columns else name
|
||||
for name in original_columns
|
||||
)
|
||||
)
|
||||
|
||||
return self._with_native(non_null_rel.union(null_rel))
|
||||
|
||||
def unpivot(
|
||||
self,
|
||||
on: Sequence[str] | None,
|
||||
index: Sequence[str] | None,
|
||||
variable_name: str,
|
||||
value_name: str,
|
||||
) -> Self:
|
||||
index_ = [] if index is None else index
|
||||
on_ = [c for c in self.columns if c not in index_] if on is None else on
|
||||
|
||||
if variable_name == "":
|
||||
msg = "`variable_name` cannot be empty string for duckdb backend."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
if value_name == "":
|
||||
msg = "`value_name` cannot be empty string for duckdb backend."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
unpivot_on = ", ".join(str(col(name)) for name in on_)
|
||||
rel = self.native # noqa: F841
|
||||
# Replace with Python API once
|
||||
# https://github.com/duckdb/duckdb/discussions/16980 is addressed.
|
||||
query = f"""
|
||||
unpivot rel
|
||||
on {unpivot_on}
|
||||
into
|
||||
name "{variable_name}"
|
||||
value "{value_name}"
|
||||
"""
|
||||
return self._with_native(
|
||||
duckdb.sql(query).select(*[*index_, variable_name, value_name])
|
||||
)
|
||||
|
||||
@requires.backend_version((1, 3))
|
||||
def with_row_index(self, name: str, order_by: Sequence[str]) -> Self:
|
||||
if order_by is None:
|
||||
msg = "Cannot pass `order_by` to `with_row_index` for DuckDB"
|
||||
raise TypeError(msg)
|
||||
expr = (window_expression(F("row_number"), order_by=order_by) - lit(1)).alias(
|
||||
name
|
||||
)
|
||||
return self._with_native(self.native.select(expr, StarExpression()))
|
||||
|
||||
def sink_parquet(self, file: str | Path | BytesIO) -> None:
|
||||
df = self.native # noqa: F841
|
||||
query = f"""
|
||||
COPY (SELECT * FROM df)
|
||||
TO '{file}'
|
||||
(FORMAT parquet)
|
||||
""" # noqa: S608
|
||||
duckdb.sql(query)
|
||||
|
||||
gather_every = not_implemented.deprecated(
|
||||
"`LazyFrame.gather_every` is deprecated and will be removed in a future version."
|
||||
)
|
||||
tail = not_implemented.deprecated(
|
||||
"`LazyFrame.tail` is deprecated and will be removed in a future version."
|
||||
)
|
||||
326
venv/lib/python3.11/site-packages/narwhals/_duckdb/expr.py
Normal file
326
venv/lib/python3.11/site-packages/narwhals/_duckdb/expr.py
Normal file
@ -0,0 +1,326 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import operator
|
||||
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
|
||||
|
||||
from duckdb import CoalesceOperator, StarExpression
|
||||
from duckdb.typing import DuckDBPyType
|
||||
|
||||
from narwhals._duckdb.expr_dt import DuckDBExprDateTimeNamespace
|
||||
from narwhals._duckdb.expr_list import DuckDBExprListNamespace
|
||||
from narwhals._duckdb.expr_str import DuckDBExprStringNamespace
|
||||
from narwhals._duckdb.expr_struct import DuckDBExprStructNamespace
|
||||
from narwhals._duckdb.utils import (
|
||||
DeferredTimeZone,
|
||||
F,
|
||||
col,
|
||||
lit,
|
||||
narwhals_to_native_dtype,
|
||||
when,
|
||||
window_expression,
|
||||
)
|
||||
from narwhals._expression_parsing import ExprKind, ExprMetadata
|
||||
from narwhals._sql.expr import SQLExpr
|
||||
from narwhals._utils import Implementation, Version
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import Expression
|
||||
from typing_extensions import Self
|
||||
|
||||
from narwhals._compliant import WindowInputs
|
||||
from narwhals._compliant.typing import (
|
||||
AliasNames,
|
||||
EvalNames,
|
||||
EvalSeries,
|
||||
WindowFunction,
|
||||
)
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.namespace import DuckDBNamespace
|
||||
from narwhals._utils import _LimitedContext
|
||||
from narwhals.typing import (
|
||||
FillNullStrategy,
|
||||
IntoDType,
|
||||
NonNestedLiteral,
|
||||
RollingInterpolationMethod,
|
||||
)
|
||||
|
||||
DuckDBWindowFunction = WindowFunction[DuckDBLazyFrame, Expression]
|
||||
DuckDBWindowInputs = WindowInputs[Expression]
|
||||
|
||||
|
||||
class DuckDBExpr(SQLExpr["DuckDBLazyFrame", "Expression"]):
|
||||
_implementation = Implementation.DUCKDB
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
call: EvalSeries[DuckDBLazyFrame, Expression],
|
||||
window_function: DuckDBWindowFunction | None = None,
|
||||
*,
|
||||
evaluate_output_names: EvalNames[DuckDBLazyFrame],
|
||||
alias_output_names: AliasNames | None,
|
||||
version: Version,
|
||||
implementation: Implementation = Implementation.DUCKDB,
|
||||
) -> None:
|
||||
self._call = call
|
||||
self._evaluate_output_names = evaluate_output_names
|
||||
self._alias_output_names = alias_output_names
|
||||
self._version = version
|
||||
self._metadata: ExprMetadata | None = None
|
||||
self._window_function: DuckDBWindowFunction | None = window_function
|
||||
|
||||
def _count_star(self) -> Expression:
|
||||
return F("count", StarExpression())
|
||||
|
||||
def _window_expression(
|
||||
self,
|
||||
expr: Expression,
|
||||
partition_by: Sequence[str | Expression] = (),
|
||||
order_by: Sequence[str | Expression] = (),
|
||||
rows_start: int | None = None,
|
||||
rows_end: int | None = None,
|
||||
*,
|
||||
descending: Sequence[bool] | None = None,
|
||||
nulls_last: Sequence[bool] | None = None,
|
||||
) -> Expression:
|
||||
return window_expression(
|
||||
expr,
|
||||
partition_by,
|
||||
order_by,
|
||||
rows_start,
|
||||
rows_end,
|
||||
descending=descending,
|
||||
nulls_last=nulls_last,
|
||||
)
|
||||
|
||||
def __narwhals_expr__(self) -> None: ...
|
||||
|
||||
def __narwhals_namespace__(self) -> DuckDBNamespace: # pragma: no cover
|
||||
from narwhals._duckdb.namespace import DuckDBNamespace
|
||||
|
||||
return DuckDBNamespace(version=self._version)
|
||||
|
||||
def broadcast(self, kind: Literal[ExprKind.AGGREGATION, ExprKind.LITERAL]) -> Self:
|
||||
if kind is ExprKind.LITERAL:
|
||||
return self
|
||||
if self._backend_version < (1, 3):
|
||||
msg = "At least version 1.3 of DuckDB is required for binary operations between aggregates and columns."
|
||||
raise NotImplementedError(msg)
|
||||
return self.over([lit(1)], [])
|
||||
|
||||
@classmethod
|
||||
def from_column_names(
|
||||
cls,
|
||||
evaluate_column_names: EvalNames[DuckDBLazyFrame],
|
||||
/,
|
||||
*,
|
||||
context: _LimitedContext,
|
||||
) -> Self:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
return [col(name) for name in evaluate_column_names(df)]
|
||||
|
||||
return cls(
|
||||
func,
|
||||
evaluate_output_names=evaluate_column_names,
|
||||
alias_output_names=None,
|
||||
version=context._version,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_column_indices(cls, *column_indices: int, context: _LimitedContext) -> Self:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
columns = df.columns
|
||||
return [col(columns[i]) for i in column_indices]
|
||||
|
||||
return cls(
|
||||
func,
|
||||
evaluate_output_names=cls._eval_names_indices(column_indices),
|
||||
alias_output_names=None,
|
||||
version=context._version,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _alias_native(cls, expr: Expression, name: str) -> Expression:
|
||||
return expr.alias(name)
|
||||
|
||||
def __invert__(self) -> Self:
|
||||
invert = cast("Callable[..., Expression]", operator.invert)
|
||||
return self._with_elementwise(invert)
|
||||
|
||||
def skew(self) -> Self:
|
||||
def func(expr: Expression) -> Expression:
|
||||
count = F("count", expr)
|
||||
# Adjust population skewness by correction factor to get sample skewness
|
||||
sample_skewness = (
|
||||
F("skewness", expr)
|
||||
* (count - lit(2))
|
||||
/ F("sqrt", count * (count - lit(1)))
|
||||
)
|
||||
return when(count == lit(0), lit(None)).otherwise(
|
||||
when(count == lit(1), lit(float("nan"))).otherwise(
|
||||
when(count == lit(2), lit(0.0)).otherwise(sample_skewness)
|
||||
)
|
||||
)
|
||||
|
||||
return self._with_callable(func)
|
||||
|
||||
def kurtosis(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("kurtosis_pop", expr))
|
||||
|
||||
def quantile(
|
||||
self, quantile: float, interpolation: RollingInterpolationMethod
|
||||
) -> Self:
|
||||
def func(expr: Expression) -> Expression:
|
||||
if interpolation == "linear":
|
||||
return F("quantile_cont", expr, lit(quantile))
|
||||
msg = "Only linear interpolation methods are supported for DuckDB quantile."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
return self._with_callable(func)
|
||||
|
||||
def n_unique(self) -> Self:
|
||||
def func(expr: Expression) -> Expression:
|
||||
# https://stackoverflow.com/a/79338887/4451315
|
||||
return F("array_unique", F("array_agg", expr)) + F(
|
||||
"max", when(expr.isnotnull(), lit(0)).otherwise(lit(1))
|
||||
)
|
||||
|
||||
return self._with_callable(func)
|
||||
|
||||
def len(self) -> Self:
|
||||
return self._with_callable(lambda _expr: F("count"))
|
||||
|
||||
def std(self, ddof: int) -> Self:
|
||||
if ddof == 0:
|
||||
return self._with_callable(lambda expr: F("stddev_pop", expr))
|
||||
if ddof == 1:
|
||||
return self._with_callable(lambda expr: F("stddev_samp", expr))
|
||||
|
||||
def _std(expr: Expression) -> Expression:
|
||||
n_samples = F("count", expr)
|
||||
return (
|
||||
F("stddev_pop", expr)
|
||||
* F("sqrt", n_samples)
|
||||
/ (F("sqrt", (n_samples - lit(ddof))))
|
||||
)
|
||||
|
||||
return self._with_callable(_std)
|
||||
|
||||
def var(self, ddof: int) -> Self:
|
||||
if ddof == 0:
|
||||
return self._with_callable(lambda expr: F("var_pop", expr))
|
||||
if ddof == 1:
|
||||
return self._with_callable(lambda expr: F("var_samp", expr))
|
||||
|
||||
def _var(expr: Expression) -> Expression:
|
||||
n_samples = F("count", expr)
|
||||
return F("var_pop", expr) * n_samples / (n_samples - lit(ddof))
|
||||
|
||||
return self._with_callable(_var)
|
||||
|
||||
def null_count(self) -> Self:
|
||||
return self._with_callable(lambda expr: F("sum", expr.isnull().cast("int")))
|
||||
|
||||
def is_nan(self) -> Self:
|
||||
return self._with_elementwise(lambda expr: F("isnan", expr))
|
||||
|
||||
def is_finite(self) -> Self:
|
||||
return self._with_elementwise(lambda expr: F("isfinite", expr))
|
||||
|
||||
def is_in(self, other: Sequence[Any]) -> Self:
|
||||
return self._with_elementwise(lambda expr: F("contains", lit(other), expr))
|
||||
|
||||
def fill_null(
|
||||
self,
|
||||
value: Self | NonNestedLiteral,
|
||||
strategy: FillNullStrategy | None,
|
||||
limit: int | None,
|
||||
) -> Self:
|
||||
if strategy is not None:
|
||||
if self._backend_version < (1, 3): # pragma: no cover
|
||||
msg = f"`fill_null` with `strategy={strategy}` is only available in 'duckdb>=1.3.0'."
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
def _fill_with_strategy(
|
||||
df: DuckDBLazyFrame, inputs: DuckDBWindowInputs
|
||||
) -> Sequence[Expression]:
|
||||
fill_func = "last_value" if strategy == "forward" else "first_value"
|
||||
rows_start, rows_end = (
|
||||
(-limit if limit is not None else None, 0)
|
||||
if strategy == "forward"
|
||||
else (0, limit)
|
||||
)
|
||||
return [
|
||||
window_expression(
|
||||
F(fill_func, expr),
|
||||
inputs.partition_by,
|
||||
inputs.order_by,
|
||||
rows_start=rows_start,
|
||||
rows_end=rows_end,
|
||||
ignore_nulls=True,
|
||||
)
|
||||
for expr in self(df)
|
||||
]
|
||||
|
||||
return self._with_window_function(_fill_with_strategy)
|
||||
|
||||
def _fill_constant(expr: Expression, value: Any) -> Expression:
|
||||
return CoalesceOperator(expr, value)
|
||||
|
||||
return self._with_elementwise(_fill_constant, value=value)
|
||||
|
||||
def cast(self, dtype: IntoDType) -> Self:
|
||||
def func(df: DuckDBLazyFrame) -> list[Expression]:
|
||||
tz = DeferredTimeZone(df.native)
|
||||
native_dtype = narwhals_to_native_dtype(dtype, self._version, tz)
|
||||
return [expr.cast(DuckDBPyType(native_dtype)) for expr in self(df)]
|
||||
|
||||
def window_f(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> list[Expression]:
|
||||
tz = DeferredTimeZone(df.native)
|
||||
native_dtype = narwhals_to_native_dtype(dtype, self._version, tz)
|
||||
return [
|
||||
expr.cast(DuckDBPyType(native_dtype))
|
||||
for expr in self.window_function(df, inputs)
|
||||
]
|
||||
|
||||
return self.__class__(
|
||||
func,
|
||||
window_f,
|
||||
evaluate_output_names=self._evaluate_output_names,
|
||||
alias_output_names=self._alias_output_names,
|
||||
version=self._version,
|
||||
)
|
||||
|
||||
def log(self, base: float) -> Self:
|
||||
def _log(expr: Expression) -> Expression:
|
||||
log = F("log", expr)
|
||||
return (
|
||||
when(expr < lit(0), lit(float("nan")))
|
||||
.when(expr == lit(0), lit(float("-inf")))
|
||||
.otherwise(log / F("log", lit(base)))
|
||||
)
|
||||
|
||||
return self._with_elementwise(_log)
|
||||
|
||||
def sqrt(self) -> Self:
|
||||
def _sqrt(expr: Expression) -> Expression:
|
||||
return when(expr < lit(0), lit(float("nan"))).otherwise(F("sqrt", expr))
|
||||
|
||||
return self._with_elementwise(_sqrt)
|
||||
|
||||
@property
|
||||
def str(self) -> DuckDBExprStringNamespace:
|
||||
return DuckDBExprStringNamespace(self)
|
||||
|
||||
@property
|
||||
def dt(self) -> DuckDBExprDateTimeNamespace:
|
||||
return DuckDBExprDateTimeNamespace(self)
|
||||
|
||||
@property
|
||||
def list(self) -> DuckDBExprListNamespace:
|
||||
return DuckDBExprListNamespace(self)
|
||||
|
||||
@property
|
||||
def struct(self) -> DuckDBExprStructNamespace:
|
||||
return DuckDBExprStructNamespace(self)
|
||||
157
venv/lib/python3.11/site-packages/narwhals/_duckdb/expr_dt.py
Normal file
157
venv/lib/python3.11/site-packages/narwhals/_duckdb/expr_dt.py
Normal file
@ -0,0 +1,157 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyExprNamespace
|
||||
from narwhals._compliant.any_namespace import DateTimeNamespace
|
||||
from narwhals._constants import (
|
||||
MS_PER_MINUTE,
|
||||
MS_PER_SECOND,
|
||||
NS_PER_SECOND,
|
||||
SECONDS_PER_MINUTE,
|
||||
US_PER_MINUTE,
|
||||
US_PER_SECOND,
|
||||
)
|
||||
from narwhals._duckdb.utils import UNITS_DICT, F, fetch_rel_time_zone, lit
|
||||
from narwhals._duration import Interval
|
||||
from narwhals._utils import not_implemented
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import Expression
|
||||
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBExprDateTimeNamespace(
    LazyExprNamespace["DuckDBExpr"], DateTimeNamespace["DuckDBExpr"]
):
    """Datetime (`.dt.*`) expression methods for the DuckDB backend.

    Each method wraps the corresponding DuckDB date/time function in an
    elementwise expression on the underlying `DuckDBExpr`.
    """

    def year(self) -> DuckDBExpr:
        """Calendar year of each value."""
        return self.compliant._with_elementwise(lambda expr: F("year", expr))

    def month(self) -> DuckDBExpr:
        """Calendar month of each value."""
        return self.compliant._with_elementwise(lambda expr: F("month", expr))

    def day(self) -> DuckDBExpr:
        """Day of month of each value."""
        return self.compliant._with_elementwise(lambda expr: F("day", expr))

    def hour(self) -> DuckDBExpr:
        """Hour component of each value."""
        return self.compliant._with_elementwise(lambda expr: F("hour", expr))

    def minute(self) -> DuckDBExpr:
        """Minute component of each value."""
        return self.compliant._with_elementwise(lambda expr: F("minute", expr))

    def second(self) -> DuckDBExpr:
        """Second component of each value."""
        return self.compliant._with_elementwise(lambda expr: F("second", expr))

    def millisecond(self) -> DuckDBExpr:
        """Millisecond component of each value.

        DuckDB's `millisecond` includes the whole-seconds part, so it is
        subtracted out to leave only the sub-second milliseconds.
        """
        return self.compliant._with_elementwise(
            lambda expr: F("millisecond", expr) - F("second", expr) * lit(MS_PER_SECOND)
        )

    def microsecond(self) -> DuckDBExpr:
        """Microsecond component of each value (seconds part subtracted out)."""
        return self.compliant._with_elementwise(
            lambda expr: F("microsecond", expr) - F("second", expr) * lit(US_PER_SECOND)
        )

    def nanosecond(self) -> DuckDBExpr:
        """Nanosecond component of each value (seconds part subtracted out)."""
        return self.compliant._with_elementwise(
            lambda expr: F("nanosecond", expr) - F("second", expr) * lit(NS_PER_SECOND)
        )

    def to_string(self, format: str) -> DuckDBExpr:
        """Format each datetime as a string via DuckDB `strftime`."""
        return self.compliant._with_elementwise(
            lambda expr: F("strftime", expr, lit(format))
        )

    def weekday(self) -> DuckDBExpr:
        """ISO day of week (DuckDB `isodow`)."""
        return self.compliant._with_elementwise(lambda expr: F("isodow", expr))

    def ordinal_day(self) -> DuckDBExpr:
        """Day of year (DuckDB `dayofyear`)."""
        return self.compliant._with_elementwise(lambda expr: F("dayofyear", expr))

    def date(self) -> DuckDBExpr:
        """Truncate to the date part by casting to DuckDB `date`."""
        return self.compliant._with_elementwise(lambda expr: expr.cast("date"))

    def total_minutes(self) -> DuckDBExpr:
        """Minute part of a duration (via `datepart('minute', ...)`)."""
        return self.compliant._with_elementwise(
            lambda expr: F("datepart", lit("minute"), expr)
        )

    def total_seconds(self) -> DuckDBExpr:
        """Duration expressed in seconds: minutes * 60 + seconds part."""
        return self.compliant._with_elementwise(
            lambda expr: lit(SECONDS_PER_MINUTE) * F("datepart", lit("minute"), expr)
            + F("datepart", lit("second"), expr)
        )

    def total_milliseconds(self) -> DuckDBExpr:
        """Duration expressed in milliseconds: minutes + millisecond part."""
        return self.compliant._with_elementwise(
            lambda expr: lit(MS_PER_MINUTE) * F("datepart", lit("minute"), expr)
            + F("datepart", lit("millisecond"), expr)
        )

    def total_microseconds(self) -> DuckDBExpr:
        """Duration expressed in microseconds: minutes + microsecond part."""
        return self.compliant._with_elementwise(
            lambda expr: lit(US_PER_MINUTE) * F("datepart", lit("minute"), expr)
            + F("datepart", lit("microsecond"), expr)
        )

    def truncate(self, every: str) -> DuckDBExpr:
        """Truncate each datetime down to the start of the `every` interval.

        Only a multiple of 1 is supported (DuckDB `date_trunc` limitation),
        and nanosecond truncation is not available.
        """
        interval = Interval.parse(every)
        multiple, unit = interval.multiple, interval.unit
        if multiple != 1:
            # https://github.com/duckdb/duckdb/issues/17554
            msg = f"Only multiple 1 is currently supported for DuckDB.\nGot {multiple!s}."
            raise ValueError(msg)
        if unit == "ns":
            msg = "Truncating to nanoseconds is not yet supported for DuckDB."
            raise NotImplementedError(msg)
        format = lit(UNITS_DICT[unit])

        def _truncate(expr: Expression) -> Expression:
            return F("date_trunc", format, expr)

        return self.compliant._with_elementwise(_truncate)

    def offset_by(self, by: str) -> DuckDBExpr:
        """Shift each datetime by the interval described in `by` (e.g. "2d")."""
        interval = Interval.parse_no_constraints(by)
        # Render e.g. "2 day" for DuckDB's date_add.
        format = lit(f"{interval.multiple!s} {UNITS_DICT[interval.unit]}")

        def _offset_by(expr: Expression) -> Expression:
            return F("date_add", format, expr)

        return self.compliant._with_callable(_offset_by)

    def _no_op_time_zone(self, time_zone: str) -> DuckDBExpr:
        """Return the expressions unchanged after validating `time_zone`.

        DuckDB keeps the time zone on the connection, not in the data type, so
        the only supported target zone is the connection's current one; any
        other request raises `NotImplementedError`.
        """

        def func(df: DuckDBLazyFrame) -> Sequence[Expression]:
            native_series_list = self.compliant(df)
            conn_time_zone = fetch_rel_time_zone(df.native)
            if conn_time_zone != time_zone:
                msg = (
                    "DuckDB stores the time zone in the connection, rather than in the "
                    f"data type, so changing the timezone to anything other than {conn_time_zone} "
                    " (the current connection time zone) is not supported."
                )
                raise NotImplementedError(msg)
            return native_series_list

        return self.compliant.__class__(
            func,
            evaluate_output_names=self.compliant._evaluate_output_names,
            alias_output_names=self.compliant._alias_output_names,
            version=self.compliant._version,
        )

    def convert_time_zone(self, time_zone: str) -> DuckDBExpr:
        """Convert to `time_zone` (only the connection zone is accepted)."""
        return self._no_op_time_zone(time_zone)

    def replace_time_zone(self, time_zone: str | None) -> DuckDBExpr:
        """Replace the time zone; `None` drops it by casting to naive timestamp."""
        if time_zone is None:
            return self.compliant._with_elementwise(lambda expr: expr.cast("timestamp"))
        else:
            return self._no_op_time_zone(time_zone)

    # Not supported by this backend.
    total_nanoseconds = not_implemented()
    timestamp = not_implemented()
|
||||
@ -0,0 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyExprNamespace
|
||||
from narwhals._compliant.any_namespace import ListNamespace
|
||||
from narwhals._duckdb.utils import F
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBExprListNamespace(
    LazyExprNamespace["DuckDBExpr"], ListNamespace["DuckDBExpr"]
):
    """List (`.list.*`) expression methods for the DuckDB backend."""

    def len(self) -> DuckDBExpr:
        """Number of elements in each list value (DuckDB `len`)."""

        def _length(expr):
            return F("len", expr)

        return self.compliant._with_elementwise(_length)
|
||||
128
venv/lib/python3.11/site-packages/narwhals/_duckdb/expr_str.py
Normal file
128
venv/lib/python3.11/site-packages/narwhals/_duckdb/expr_str.py
Normal file
@ -0,0 +1,128 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyExprNamespace
|
||||
from narwhals._compliant.any_namespace import StringNamespace
|
||||
from narwhals._duckdb.utils import F, lit, when
|
||||
from narwhals._utils import not_implemented
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from duckdb import Expression
|
||||
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBExprStringNamespace(
    LazyExprNamespace["DuckDBExpr"], StringNamespace["DuckDBExpr"]
):
    """String (`.str.*`) expression methods for the DuckDB backend."""

    def starts_with(self, prefix: str) -> DuckDBExpr:
        """True where the string starts with `prefix`."""
        return self.compliant._with_elementwise(
            lambda expr: F("starts_with", expr, lit(prefix))
        )

    def ends_with(self, suffix: str) -> DuckDBExpr:
        """True where the string ends with `suffix`."""
        return self.compliant._with_elementwise(
            lambda expr: F("ends_with", expr, lit(suffix))
        )

    def contains(self, pattern: str, *, literal: bool) -> DuckDBExpr:
        """True where `pattern` occurs; literal substring or regex match."""

        def func(expr: Expression) -> Expression:
            if literal:
                return F("contains", expr, lit(pattern))
            return F("regexp_matches", expr, lit(pattern))

        return self.compliant._with_elementwise(func)

    def slice(self, offset: int, length: int | None) -> DuckDBExpr:
        """Substring starting at `offset` (may be negative) of `length` chars.

        DuckDB `array_slice` is 1-based and end-inclusive, hence the +1
        adjustments; a negative offset is counted from the string's end.
        """

        def func(expr: Expression) -> Expression:
            offset_lit = lit(offset)
            return F(
                "array_slice",
                expr,
                lit(offset + 1)
                if offset >= 0
                else F("length", expr) + offset_lit + lit(1),
                F("length", expr) if length is None else lit(length) + offset_lit,
            )

        return self.compliant._with_elementwise(func)

    def split(self, by: str) -> DuckDBExpr:
        """Split each string on the literal separator `by`."""
        return self.compliant._with_elementwise(
            lambda expr: F("str_split", expr, lit(by))
        )

    def len_chars(self) -> DuckDBExpr:
        """Length of each string (DuckDB `length`)."""
        return self.compliant._with_elementwise(lambda expr: F("length", expr))

    def to_lowercase(self) -> DuckDBExpr:
        """Lower-case each string."""
        return self.compliant._with_elementwise(lambda expr: F("lower", expr))

    def to_uppercase(self) -> DuckDBExpr:
        """Upper-case each string."""
        return self.compliant._with_elementwise(lambda expr: F("upper", expr))

    def strip_chars(self, characters: str | None) -> DuckDBExpr:
        """Strip `characters` (default: ASCII whitespace) from both ends."""
        import string

        return self.compliant._with_elementwise(
            lambda expr: F(
                "trim", expr, lit(string.whitespace if characters is None else characters)
            )
        )

    def replace_all(self, pattern: str, value: str, *, literal: bool) -> DuckDBExpr:
        """Replace every occurrence of `pattern`; regex unless `literal`."""
        if not literal:
            # 'g' flag: replace all matches, not just the first.
            return self.compliant._with_elementwise(
                lambda expr: F("regexp_replace", expr, lit(pattern), lit(value), lit("g"))
            )
        return self.compliant._with_elementwise(
            lambda expr: F("replace", expr, lit(pattern), lit(value))
        )

    def to_datetime(self, format: str | None) -> DuckDBExpr:
        """Parse strings as datetimes with an explicit `format` (required)."""
        if format is None:
            msg = "Cannot infer format with DuckDB backend, please specify `format` explicitly."
            raise NotImplementedError(msg)

        return self.compliant._with_elementwise(
            lambda expr: F("strptime", expr, lit(format))
        )

    def to_date(self, format: str | None) -> DuckDBExpr:
        """Parse strings as dates; with a format go via `to_datetime().dt.date()`."""
        if format is not None:
            return self.to_datetime(format=format).dt.date()

        compliant_expr = self.compliant
        return compliant_expr.cast(compliant_expr._version.dtypes.Date())

    def zfill(self, width: int) -> DuckDBExpr:
        # DuckDB does not have a built-in zfill function, so we need to implement it manually
        # using string manipulation functions.

        def func(expr: Expression) -> Expression:
            less_than_width = F("length", expr) < lit(width)
            zero, hyphen, plus = lit("0"), lit("-"), lit("+")

            starts_with_minus = F("starts_with", expr, hyphen)
            starts_with_plus = F("starts_with", expr, plus)
            # Strip the sign (first char), pad the rest, then re-attach the sign.
            substring = F("substr", expr, lit(2))
            padded_substring = F("lpad", substring, lit(width - 1), zero)
            return (
                when(
                    starts_with_minus & less_than_width,
                    F("concat", hyphen, padded_substring),
                )
                .when(
                    starts_with_plus & less_than_width,
                    F("concat", plus, padded_substring),
                )
                .when(less_than_width, F("lpad", expr, lit(width), zero))
                .otherwise(expr)
            )

        # can't use `_with_elementwise` due to `when` operator.
        # TODO(unassigned): implement `window_func` like we do in `Expr.cast`
        return self.compliant._with_callable(func)

    # Single-occurrence replace is not supported by this backend.
    replace = not_implemented()
|
||||
@ -0,0 +1,19 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import LazyExprNamespace
|
||||
from narwhals._compliant.any_namespace import StructNamespace
|
||||
from narwhals._duckdb.utils import F, lit
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBExprStructNamespace(
    LazyExprNamespace["DuckDBExpr"], StructNamespace["DuckDBExpr"]
):
    """Struct (`.struct.*`) expression methods for the DuckDB backend."""

    def field(self, name: str) -> DuckDBExpr:
        """Extract struct field `name`, aliasing the output to the field name."""

        def _extract(expr):
            return F("struct_extract", expr, lit(name))

        return self.compliant._with_elementwise(_extract).alias(name)
|
||||
@ -0,0 +1,33 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from itertools import chain
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._sql.group_by import SQLGroupBy
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import Expression # noqa: F401
|
||||
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
|
||||
class DuckDBGroupBy(SQLGroupBy["DuckDBLazyFrame", "DuckDBExpr", "Expression"]):
    """Group-by implementation backed by `DuckDBPyRelation.aggregate`."""

    def __init__(
        self,
        df: DuckDBLazyFrame,
        keys: Sequence[DuckDBExpr] | Sequence[str],
        /,
        *,
        drop_null_keys: bool,
    ) -> None:
        # Normalise `keys` into internal key names plus the names they should
        # carry in the output.
        frame, self._keys, self._output_key_names = self._parse_keys(df, keys=keys)
        # Optionally filter out rows with null group keys before aggregating.
        self._compliant_frame = frame.drop_nulls(self._keys) if drop_null_keys else frame

    def agg(self, *exprs: DuckDBExpr) -> DuckDBLazyFrame:
        """Evaluate `exprs` per group; result holds keys then aggregates."""
        agg_columns = list(chain(self._keys, self._evaluate_exprs(exprs)))
        return self.compliant._with_native(
            self.compliant.native.aggregate(agg_columns)  # type: ignore[arg-type]
        ).rename(dict(zip(self._keys, self._output_key_names)))
|
||||
158
venv/lib/python3.11/site-packages/narwhals/_duckdb/namespace.py
Normal file
158
venv/lib/python3.11/site-packages/narwhals/_duckdb/namespace.py
Normal file
@ -0,0 +1,158 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import operator
|
||||
from functools import reduce
|
||||
from itertools import chain
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from duckdb import CoalesceOperator, Expression
|
||||
from duckdb.typing import BIGINT, VARCHAR
|
||||
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
from narwhals._duckdb.selectors import DuckDBSelectorNamespace
|
||||
from narwhals._duckdb.utils import (
|
||||
DeferredTimeZone,
|
||||
F,
|
||||
concat_str,
|
||||
function,
|
||||
lit,
|
||||
narwhals_to_native_dtype,
|
||||
when,
|
||||
)
|
||||
from narwhals._expression_parsing import (
|
||||
combine_alias_output_names,
|
||||
combine_evaluate_output_names,
|
||||
)
|
||||
from narwhals._sql.namespace import SQLNamespace
|
||||
from narwhals._sql.when_then import SQLThen, SQLWhen
|
||||
from narwhals._utils import Implementation
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
from duckdb import DuckDBPyRelation # noqa: F401
|
||||
|
||||
from narwhals._utils import Version
|
||||
from narwhals.typing import ConcatMethod, IntoDType, NonNestedLiteral
|
||||
|
||||
|
||||
class DuckDBNamespace(
    SQLNamespace[DuckDBLazyFrame, DuckDBExpr, "DuckDBPyRelation", Expression]
):
    """Top-level narwhals namespace for the DuckDB backend.

    Provides constructors (`lit`, `len`, `when`) and horizontal operations
    over `DuckDBExpr`/`DuckDBLazyFrame`.
    """

    _implementation: Implementation = Implementation.DUCKDB

    def __init__(self, *, version: Version) -> None:
        self._version = version

    @property
    def selectors(self) -> DuckDBSelectorNamespace:
        """Column-selector namespace bound to this namespace."""
        return DuckDBSelectorNamespace.from_namespace(self)

    @property
    def _expr(self) -> type[DuckDBExpr]:
        return DuckDBExpr

    @property
    def _lazyframe(self) -> type[DuckDBLazyFrame]:
        return DuckDBLazyFrame

    def _function(self, name: str, *args: Expression) -> Expression:  # type: ignore[override]
        return function(name, *args)

    def _lit(self, value: Any) -> Expression:
        return lit(value)

    def _when(self, condition: Expression, value: Expression) -> Expression:
        return when(condition, value)

    def _coalesce(self, *exprs: Expression) -> Expression:
        return CoalesceOperator(*exprs)

    def concat(
        self, items: Iterable[DuckDBLazyFrame], *, how: ConcatMethod
    ) -> DuckDBLazyFrame:
        """Concatenate frames vertically via DuckDB `union`.

        Raises:
            TypeError: if `how == "vertical"` and the schemas differ.
        """
        # Materialize once up front: `items` may be a one-shot iterator, and
        # iterating it twice (native frames, then schema check) would leave the
        # second pass empty and crash on `items[0]`.
        items = list(items)
        native_items = [item._native_frame for item in items]
        first = items[0]
        schema = first.schema
        if how == "vertical" and not all(x.schema == schema for x in items[1:]):
            msg = "inputs should all have the same schema"
            raise TypeError(msg)
        res = reduce(lambda x, y: x.union(y), native_items)
        return first._with_native(res)

    def concat_str(
        self, *exprs: DuckDBExpr, separator: str, ignore_nulls: bool
    ) -> DuckDBExpr:
        """Concatenate string columns with `separator`.

        When `ignore_nulls` is False, rows where any input is null produce
        null (the whole concatenation is masked out).
        """

        def func(df: DuckDBLazyFrame) -> list[Expression]:
            cols = list(chain.from_iterable(expr(df) for expr in exprs))
            if not ignore_nulls:
                null_mask_result = reduce(operator.or_, (s.isnull() for s in cols))
                # Interleave casted columns with the separator literal; the
                # last column gets no trailing separator.
                cols_separated = [
                    y
                    for x in [
                        (col.cast(VARCHAR),)
                        if i == len(cols) - 1
                        else (col.cast(VARCHAR), lit(separator))
                        for i, col in enumerate(cols)
                    ]
                    for y in x
                ]
                return [when(~null_mask_result, concat_str(*cols_separated))]
            else:
                return [concat_str(*cols, separator=separator)]

        return self._expr(
            call=func,
            evaluate_output_names=combine_evaluate_output_names(*exprs),
            alias_output_names=combine_alias_output_names(*exprs),
            version=self._version,
        )

    def mean_horizontal(self, *exprs: DuckDBExpr) -> DuckDBExpr:
        """Row-wise mean across columns, ignoring nulls in both sum and count."""

        def func(cols: Iterable[Expression]) -> Expression:
            cols = list(cols)
            # Null-safe sum (nulls counted as 0) divided by non-null count.
            return reduce(
                operator.add, (CoalesceOperator(col, lit(0)) for col in cols)
            ) / reduce(operator.add, (col.isnotnull().cast(BIGINT) for col in cols))

        return self._expr._from_elementwise_horizontal_op(func, *exprs)

    def when(self, predicate: DuckDBExpr) -> DuckDBWhen:
        """Start a `when/then/otherwise` chain on `predicate`."""
        return DuckDBWhen.from_expr(predicate, context=self)

    def lit(self, value: NonNestedLiteral, dtype: IntoDType | None) -> DuckDBExpr:
        """Literal expression named "literal", optionally cast to `dtype`."""

        def func(df: DuckDBLazyFrame) -> list[Expression]:
            tz = DeferredTimeZone(df.native)
            if dtype is not None:
                target = narwhals_to_native_dtype(dtype, self._version, tz)
                return [lit(value).cast(target)]  # type: ignore[arg-type]
            return [lit(value)]

        return self._expr(
            func,
            evaluate_output_names=lambda _df: ["literal"],
            alias_output_names=None,
            version=self._version,
        )

    def len(self) -> DuckDBExpr:
        """Row-count expression named "len" (DuckDB `count()`)."""

        def func(_df: DuckDBLazyFrame) -> list[Expression]:
            return [F("count")]

        return self._expr(
            call=func,
            evaluate_output_names=lambda _df: ["len"],
            alias_output_names=None,
            version=self._version,
        )
|
||||
|
||||
|
||||
class DuckDBWhen(SQLWhen["DuckDBLazyFrame", Expression, DuckDBExpr]):
    """`when` step of the when/then chain; logic lives in `SQLWhen`."""

    @property
    def _then(self) -> type[DuckDBThen]:
        # Tells the shared SQL implementation which `Then` class to build.
        return DuckDBThen
|
||||
|
||||
|
||||
# `then` step of the chain; inherits all behavior from `SQLThen` + `DuckDBExpr`.
class DuckDBThen(SQLThen["DuckDBLazyFrame", Expression, DuckDBExpr], DuckDBExpr): ...
|
||||
@ -0,0 +1,33 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._compliant import CompliantSelector, LazySelectorNamespace
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from duckdb import Expression # noqa: F401
|
||||
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame # noqa: F401
|
||||
from narwhals._duckdb.expr import DuckDBWindowFunction
|
||||
|
||||
|
||||
class DuckDBSelectorNamespace(LazySelectorNamespace["DuckDBLazyFrame", "Expression"]):
    """Column-selector namespace (e.g. by dtype/name) for DuckDB frames."""

    @property
    def _selector(self) -> type[DuckDBSelector]:
        # Concrete selector class the shared implementation instantiates.
        return DuckDBSelector
|
||||
|
||||
|
||||
class DuckDBSelector(  # type: ignore[misc]
    CompliantSelector["DuckDBLazyFrame", "Expression"], DuckDBExpr
):
    """A `DuckDBExpr` that selects existing columns.

    Selectors carry no window function by default.
    """

    _window_function: DuckDBWindowFunction | None = None

    def _to_expr(self) -> DuckDBExpr:
        """Re-wrap this selector as a plain `DuckDBExpr` with the same state."""
        return DuckDBExpr(
            self._call,
            self._window_function,
            evaluate_output_names=self._evaluate_output_names,
            alias_output_names=self._alias_output_names,
            version=self._version,
        )
|
||||
44
venv/lib/python3.11/site-packages/narwhals/_duckdb/series.py
Normal file
44
venv/lib/python3.11/site-packages/narwhals/_duckdb/series.py
Normal file
@ -0,0 +1,44 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from narwhals._duckdb.utils import DeferredTimeZone, native_to_narwhals_dtype
|
||||
from narwhals.dependencies import get_duckdb
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from types import ModuleType
|
||||
|
||||
import duckdb
|
||||
from typing_extensions import Never, Self
|
||||
|
||||
from narwhals._utils import Version
|
||||
from narwhals.dtypes import DType
|
||||
|
||||
|
||||
class DuckDBInterchangeSeries:
    """Interchange-level (read-only, dtype-only) wrapper of a one-column relation.

    Any attribute other than the ones defined here raises `NotImplementedError`.
    """

    def __init__(self, df: duckdb.DuckDBPyRelation, version: Version) -> None:
        self._native_series = df
        self._version = version

    def __narwhals_series__(self) -> Self:
        return self

    def __native_namespace__(self) -> ModuleType:
        """The native module backing this series (duckdb)."""
        return get_duckdb()  # type: ignore[no-any-return]

    @property
    def dtype(self) -> DType:
        """Narwhals dtype of the relation's first (only) column."""
        return native_to_narwhals_dtype(
            self._native_series.types[0],
            self._version,
            DeferredTimeZone(self._native_series),
        )

    def __getattr__(self, attr: str) -> Never:
        # Interchange-level objects deliberately expose almost nothing.
        msg = (  # pragma: no cover
            f"Attribute {attr} is not supported for interchange-level dataframes.\n\n"
            "If you would like to see this kind of object better supported in "
            "Narwhals, please open a feature request "
            "at https://github.com/narwhals-dev/narwhals/issues."
        )
        raise NotImplementedError(msg)  # pragma: no cover
|
||||
18
venv/lib/python3.11/site-packages/narwhals/_duckdb/typing.py
Normal file
18
venv/lib/python3.11/site-packages/narwhals/_duckdb/typing.py
Normal file
@ -0,0 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, TypedDict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import Expression
|
||||
|
||||
|
||||
class WindowExpressionKwargs(TypedDict, total=False):
    """Keyword arguments accepted by `window_expression` (all optional)."""

    # Columns (names or expressions) for the `partition by` clause.
    partition_by: Sequence[str | Expression]
    # Columns (names or expressions) for the `order by` clause.
    order_by: Sequence[str | Expression]
    # Window frame bounds in rows (None = unbounded on that side).
    rows_start: int | None
    rows_end: int | None
    # Per-`order_by`-column sort direction and null placement.
    descending: Sequence[bool]
    nulls_last: Sequence[bool]
    # Whether the windowed aggregate should skip nulls.
    ignore_nulls: bool
|
||||
379
venv/lib/python3.11/site-packages/narwhals/_duckdb/utils.py
Normal file
379
venv/lib/python3.11/site-packages/narwhals/_duckdb/utils.py
Normal file
@ -0,0 +1,379 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import duckdb
|
||||
|
||||
from narwhals._utils import Version, isinstance_or_issubclass
|
||||
from narwhals.exceptions import ColumnNotFoundError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from duckdb import DuckDBPyRelation, Expression
|
||||
from duckdb.typing import DuckDBPyType
|
||||
|
||||
from narwhals._compliant.typing import CompliantLazyFrameAny
|
||||
from narwhals._duckdb.dataframe import DuckDBLazyFrame
|
||||
from narwhals._duckdb.expr import DuckDBExpr
|
||||
from narwhals.dtypes import DType
|
||||
from narwhals.typing import IntoDType
|
||||
|
||||
|
||||
UNITS_DICT = {
|
||||
"y": "year",
|
||||
"q": "quarter",
|
||||
"mo": "month",
|
||||
"d": "day",
|
||||
"h": "hour",
|
||||
"m": "minute",
|
||||
"s": "second",
|
||||
"ms": "millisecond",
|
||||
"us": "microsecond",
|
||||
"ns": "nanosecond",
|
||||
}
|
||||
UNIT_TO_TIMESTAMPS = {
|
||||
"s": "TIMESTAMP_S",
|
||||
"ms": "TIMESTAMP_MS",
|
||||
"us": "TIMESTAMP",
|
||||
"ns": "TIMESTAMP_NS",
|
||||
}
|
||||
DESCENDING_TO_ORDER = {True: "desc", False: "asc"}
|
||||
NULLS_LAST_TO_NULLS_POS = {True: "nulls last", False: "nulls first"}
|
||||
|
||||
col = duckdb.ColumnExpression
|
||||
"""Alias for `duckdb.ColumnExpression`."""
|
||||
|
||||
lit = duckdb.ConstantExpression
|
||||
"""Alias for `duckdb.ConstantExpression`."""
|
||||
|
||||
when = duckdb.CaseExpression
|
||||
"""Alias for `duckdb.CaseExpression`."""
|
||||
|
||||
F = duckdb.FunctionExpression
|
||||
"""Alias for `duckdb.FunctionExpression`."""
|
||||
|
||||
|
||||
def concat_str(*exprs: Expression, separator: str = "") -> Expression:
    """Concatenate many strings, NULL inputs are skipped.

    Wraps [concat] and [concat_ws] `FunctionExpression`(s).

    Arguments:
        exprs: Native columns.
        separator: String that will be used to separate the values of each column.

    Returns:
        A new native expression.

    [concat]: https://duckdb.org/docs/stable/sql/functions/char.html#concatstring-
    [concat_ws]: https://duckdb.org/docs/stable/sql/functions/char.html#concat_wsseparator-string-
    """
    if separator:
        return F("concat_ws", lit(separator), *exprs)
    return F("concat", *exprs)
|
||||
|
||||
|
||||
def evaluate_exprs(
    df: DuckDBLazyFrame, /, *exprs: DuckDBExpr
) -> list[tuple[str, Expression]]:
    """Evaluate each expression against `df`, pairing outputs with their names."""
    pairs: list[tuple[str, Expression]] = []
    for expr in exprs:
        expressions = expr._call(df)
        output_names = expr._evaluate_output_names(df)
        aliaser = expr._alias_output_names
        if aliaser is not None:
            output_names = aliaser(output_names)
        if len(output_names) != len(expressions):  # pragma: no cover
            msg = f"Internal error: got output names {output_names}, but only got {len(expressions)} results"
            raise AssertionError(msg)
        pairs.extend(zip(output_names, expressions))
    return pairs
|
||||
|
||||
|
||||
class DeferredTimeZone:
    """Object which gets passed between `native_to_narwhals_dtype` calls.

    DuckDB stores the time zone in the connection, rather than in the dtypes, so
    this ensures that when calculating the schema of a dataframe with multiple
    timezone-aware columns, that the connection's time zone is only fetched once.

    Note: we cannot make the time zone a cached `DuckDBLazyFrame` property because
    the time zone can be modified after `DuckDBLazyFrame` creation:

    ```python
    df = nw.from_native(rel)
    print(df.collect_schema())
    rel.query("set timezone = 'Asia/Kolkata'")
    print(df.collect_schema())  # should change to reflect new time zone
    ```
    """

    # Memoized connection time zone; populated lazily on first access.
    _cached_time_zone: str | None = None

    def __init__(self, rel: DuckDBPyRelation) -> None:
        self._rel = rel

    @property
    def time_zone(self) -> str:
        """Fetch relation time zone (if it wasn't calculated already)."""
        if self._cached_time_zone is None:
            self._cached_time_zone = fetch_rel_time_zone(self._rel)
        return self._cached_time_zone
|
||||
|
||||
|
||||
def native_to_narwhals_dtype(
    duckdb_dtype: DuckDBPyType, version: Version, deferred_time_zone: DeferredTimeZone
) -> DType:
    """Map a DuckDB dtype to the corresponding narwhals dtype.

    Nested types (list/struct/array) recurse on their children; the connection
    time zone is only fetched (via `deferred_time_zone`) for tz-aware timestamps.
    """
    duckdb_dtype_id = duckdb_dtype.id
    dtypes = version.dtypes

    # Handle nested data types first
    if duckdb_dtype_id == "list":
        return dtypes.List(
            native_to_narwhals_dtype(duckdb_dtype.child, version, deferred_time_zone)
        )

    if duckdb_dtype_id == "struct":
        children = duckdb_dtype.children
        return dtypes.Struct(
            [
                dtypes.Field(
                    name=child[0],
                    dtype=native_to_narwhals_dtype(child[1], version, deferred_time_zone),
                )
                for child in children
            ]
        )

    if duckdb_dtype_id == "array":
        # Unnest nested fixed-size arrays, collecting the shape outer-to-inner.
        child, size = duckdb_dtype.children
        shape: list[int] = [size[1]]

        while child[1].id == "array":
            child, size = child[1].children
            shape.insert(0, size[1])

        inner = native_to_narwhals_dtype(child[1], version, deferred_time_zone)
        return dtypes.Array(inner=inner, shape=tuple(shape))

    if duckdb_dtype_id == "enum":
        if version is Version.V1:
            # stable.v1 Enum carries no categories.
            return dtypes.Enum()  # type: ignore[call-arg]
        categories = duckdb_dtype.children[0][1]
        return dtypes.Enum(categories=categories)

    if duckdb_dtype_id == "timestamp with time zone":
        return dtypes.Datetime(time_zone=deferred_time_zone.time_zone)

    # Everything else is a flat type with a cached lookup.
    return _non_nested_native_to_narwhals_dtype(duckdb_dtype_id, version)
|
||||
|
||||
|
||||
def fetch_rel_time_zone(rel: duckdb.DuckDBPyRelation) -> str:
    """Return the `TimeZone` setting of the relation's connection."""
    row = rel.query(
        "duckdb_settings()", "select value from duckdb_settings() where name = 'TimeZone'"
    ).fetchone()
    assert row is not None  # noqa: S101
    return row[0]  # type: ignore[no-any-return]
|
||||
|
||||
|
||||
@lru_cache(maxsize=16)
def _non_nested_native_to_narwhals_dtype(duckdb_dtype_id: str, version: Version) -> DType:
    """Map a flat DuckDB dtype id to a narwhals dtype; unmatched ids -> Unknown.

    Cached because the same handful of (id, version) pairs recur per schema.
    """
    dtypes = version.dtypes
    return {
        "hugeint": dtypes.Int128(),
        "bigint": dtypes.Int64(),
        "integer": dtypes.Int32(),
        "smallint": dtypes.Int16(),
        "tinyint": dtypes.Int8(),
        "uhugeint": dtypes.UInt128(),
        "ubigint": dtypes.UInt64(),
        "uinteger": dtypes.UInt32(),
        "usmallint": dtypes.UInt16(),
        "utinyint": dtypes.UInt8(),
        "double": dtypes.Float64(),
        "float": dtypes.Float32(),
        "varchar": dtypes.String(),
        "date": dtypes.Date(),
        "timestamp_s": dtypes.Datetime("s"),
        "timestamp_ms": dtypes.Datetime("ms"),
        "timestamp": dtypes.Datetime(),
        "timestamp_ns": dtypes.Datetime("ns"),
        "boolean": dtypes.Boolean(),
        "interval": dtypes.Duration(),
        "decimal": dtypes.Decimal(),
        "time": dtypes.Time(),
        "blob": dtypes.Binary(),
    }.get(duckdb_dtype_id, dtypes.Unknown())
|
||||
|
||||
|
||||
def narwhals_to_native_dtype(  # noqa: PLR0912,PLR0915,C901
    dtype: IntoDType, version: Version, deferred_time_zone: DeferredTimeZone
) -> str:
    """Map a narwhals dtype to a DuckDB type string (for casts/declarations).

    Raises:
        NotImplementedError: for unsupported targets (Decimal, Categorical,
            stable.v1 Enum, non-microsecond Duration).
        ValueError: for invalid targets (Enum without categories, tz-aware
            Datetime with unsupported unit or a zone other than the
            connection's).
    """
    dtypes = version.dtypes
    if isinstance_or_issubclass(dtype, dtypes.Decimal):
        msg = "Casting to Decimal is not supported yet."
        raise NotImplementedError(msg)
    if isinstance_or_issubclass(dtype, dtypes.Float64):
        return "DOUBLE"
    if isinstance_or_issubclass(dtype, dtypes.Float32):
        return "FLOAT"
    if isinstance_or_issubclass(dtype, dtypes.Int128):
        return "INT128"
    if isinstance_or_issubclass(dtype, dtypes.Int64):
        return "BIGINT"
    if isinstance_or_issubclass(dtype, dtypes.Int32):
        return "INTEGER"
    if isinstance_or_issubclass(dtype, dtypes.Int16):
        return "SMALLINT"
    if isinstance_or_issubclass(dtype, dtypes.Int8):
        return "TINYINT"
    if isinstance_or_issubclass(dtype, dtypes.UInt128):
        return "UINT128"
    if isinstance_or_issubclass(dtype, dtypes.UInt64):
        return "UBIGINT"
    if isinstance_or_issubclass(dtype, dtypes.UInt32):
        return "UINTEGER"
    if isinstance_or_issubclass(dtype, dtypes.UInt16):  # pragma: no cover
        return "USMALLINT"
    if isinstance_or_issubclass(dtype, dtypes.UInt8):  # pragma: no cover
        return "UTINYINT"
    if isinstance_or_issubclass(dtype, dtypes.String):
        return "VARCHAR"
    if isinstance_or_issubclass(dtype, dtypes.Boolean):  # pragma: no cover
        return "BOOLEAN"
    if isinstance_or_issubclass(dtype, dtypes.Time):
        return "TIME"
    if isinstance_or_issubclass(dtype, dtypes.Binary):
        return "BLOB"
    if isinstance_or_issubclass(dtype, dtypes.Categorical):
        msg = "Categorical not supported by DuckDB"
        raise NotImplementedError(msg)
    if isinstance_or_issubclass(dtype, dtypes.Enum):
        if version is Version.V1:
            msg = "Converting to Enum is not supported in narwhals.stable.v1"
            raise NotImplementedError(msg)
        if isinstance(dtype, dtypes.Enum):
            categories = "'" + "', '".join(dtype.categories) + "'"
            return f"ENUM ({categories})"
        msg = "Can not cast / initialize Enum without categories present"
        raise ValueError(msg)

    if isinstance_or_issubclass(dtype, dtypes.Datetime):
        tu = dtype.time_unit
        tz = dtype.time_zone
        if not tz:
            return UNIT_TO_TIMESTAMPS[tu]
        if tu != "us":
            msg = f"Only microsecond precision is supported for timezone-aware `Datetime` in DuckDB, got {tu} precision"
            raise ValueError(msg)
        if tz != (rel_tz := deferred_time_zone.time_zone):  # pragma: no cover
            msg = f"Only the connection time zone {rel_tz} is supported, got: {tz}."
            raise ValueError(msg)
        # TODO(unassigned): cover once https://github.com/narwhals-dev/narwhals/issues/2742 addressed
        return "TIMESTAMPTZ"  # pragma: no cover
    if isinstance_or_issubclass(dtype, dtypes.Duration):
        if (tu := dtype.time_unit) != "us":  # pragma: no cover
            msg = f"Only microsecond-precision Duration is supported, got {tu} precision"
            # Bug fix: `msg` was built but never raised, so non-"us" durations
            # silently mapped to INTERVAL.
            raise NotImplementedError(msg)
        return "INTERVAL"
    if isinstance_or_issubclass(dtype, dtypes.Date):
        return "DATE"
    if isinstance_or_issubclass(dtype, dtypes.List):
        inner = narwhals_to_native_dtype(dtype.inner, version, deferred_time_zone)
        return f"{inner}[]"
    if isinstance_or_issubclass(dtype, dtypes.Struct):
        inner = ", ".join(
            f'"{field.name}" {narwhals_to_native_dtype(field.dtype, version, deferred_time_zone)}'
            for field in dtype.fields
        )
        return f"STRUCT({inner})"
    if isinstance_or_issubclass(dtype, dtypes.Array):
        # Render e.g. FLOAT[2][3] by unwrapping the inner dtype per dimension.
        shape = dtype.shape
        duckdb_shape_fmt = "".join(f"[{item}]" for item in shape)
        inner_dtype: Any = dtype
        for _ in shape:
            inner_dtype = inner_dtype.inner
        duckdb_inner = narwhals_to_native_dtype(inner_dtype, version, deferred_time_zone)
        return f"{duckdb_inner}{duckdb_shape_fmt}"
    msg = f"Unknown dtype: {dtype}"  # pragma: no cover
    raise AssertionError(msg)
|
||||
|
||||
|
||||
def parse_into_expression(into_expression: str | Expression) -> Expression:
|
||||
return col(into_expression) if isinstance(into_expression, str) else into_expression
|
||||
|
||||
|
||||
def generate_partition_by_sql(*partition_by: str | Expression) -> str:
|
||||
if not partition_by:
|
||||
return ""
|
||||
by_sql = ", ".join([f"{parse_into_expression(x)}" for x in partition_by])
|
||||
return f"partition by {by_sql}"
|
||||
|
||||
|
||||
def generate_order_by_sql(
|
||||
*order_by: str | Expression, descending: Sequence[bool], nulls_last: Sequence[bool]
|
||||
) -> str:
|
||||
if not order_by:
|
||||
return ""
|
||||
by_sql = ",".join(
|
||||
f"{parse_into_expression(x)} {DESCENDING_TO_ORDER[_descending]} {NULLS_LAST_TO_NULLS_POS[_nulls_last]}"
|
||||
for x, _descending, _nulls_last in zip(order_by, descending, nulls_last)
|
||||
)
|
||||
return f"order by {by_sql}"
|
||||
|
||||
|
||||
def window_expression(
    expr: Expression,
    partition_by: Sequence[str | Expression] = (),
    order_by: Sequence[str | Expression] = (),
    rows_start: int | None = None,
    rows_end: int | None = None,
    *,
    descending: Sequence[bool] | None = None,
    nulls_last: Sequence[bool] | None = None,
    ignore_nulls: bool = False,
) -> Expression:
    """Wrap `expr` in a SQL `over (...)` window clause.

    Builds the window specification as raw SQL text (partitioning,
    ordering, and an optional `rows between ...` frame) and returns it as
    a DuckDB `SQLExpression`.

    Raises:
        NotImplementedError: if the installed DuckDB lacks
            `SQLExpression` (requires DuckDB>=1.3.0).
    """
    # TODO(unassigned): Replace with `duckdb.WindowExpression` when they release it.
    # https://github.com/duckdb/duckdb/discussions/14725#discussioncomment-11200348
    try:
        from duckdb import SQLExpression
    except ModuleNotFoundError as exc:  # pragma: no cover
        msg = f"DuckDB>=1.3.0 is required for this operation. Found: DuckDB {duckdb.__version__}"
        raise NotImplementedError(msg) from exc

    n_keys = len(order_by)
    # Unspecified flags default to ascending order with nulls first.
    partition_sql = generate_partition_by_sql(*partition_by)
    order_sql = generate_order_by_sql(
        *order_by,
        descending=descending or [False] * n_keys,
        nulls_last=nulls_last or [False] * n_keys,
    )

    if rows_start is None and rows_end is None:
        frame_sql = ""
    else:
        # NOTE(review): rows_start appears to be a non-positive offset relative
        # to the current row, hence the negation into a `preceding` count —
        # confirm against callers.
        lower = (
            "unbounded preceding" if rows_start is None else f"{-rows_start} preceding"
        )
        upper = "unbounded following" if rows_end is None else f"{rows_end} following"
        frame_sql = f"rows between {lower} and {upper}"

    if ignore_nulls:
        # Splice `ignore nulls` inside the function call's closing paren.
        func_sql = f"{str(expr).removesuffix(')')} ignore nulls)"
    else:
        func_sql = str(expr)
    return SQLExpression(f"{func_sql} over ({partition_sql} {order_sql} {frame_sql})")
|
||||
|
||||
|
||||
def catch_duckdb_exception(
    exception: Exception, frame: CompliantLazyFrameAny, /
) -> ColumnNotFoundError | Exception:
    """Map DuckDB binder errors about unknown columns to `ColumnNotFoundError`.

    Any exception that is not a recognised missing-column
    `BinderException` is returned untouched for the caller to re-raise.
    """
    missing_column_markers = (
        "not found in FROM clause",
        "this column cannot be referenced before it is defined",
    )
    if isinstance(exception, duckdb.BinderException):
        message = str(exception)
        if any(marker in message for marker in missing_column_markers):
            return ColumnNotFoundError.from_available_column_names(
                available_columns=frame.columns
            )
    # Anything we don't recognise passes through as-is.
    return exception
|
||||
|
||||
|
||||
def function(name: str, *args: Expression) -> Expression:
|
||||
if name == "isnull":
|
||||
return args[0].isnull()
|
||||
return F(name, *args)
|
||||
Reference in New Issue
Block a user