some new features

ilgazca
2025-07-30 18:53:50 +03:00
parent 8019bd3b7c
commit 079804a0fc
2118 changed files with 297840 additions and 502 deletions

View File

@@ -0,0 +1,14 @@
from .csv import read_csv
from .json import dump_stan_json, write_stan_json
from .reshape import Variable, parse_header, stan_variables

__all__ = [
    "read_csv",
    "write_stan_json",
    "dump_stan_json",
    "Variable",
    "parse_header",
    "stan_variables",
]

__version__ = "0.5.1"

View File

@@ -0,0 +1,50 @@
"""
Module to load the minimal information from a Stan CSV file.
Only the header row and data are read, no metadata is parsed.
"""
from typing import List, Tuple, Union
import numpy as np
import numpy.typing as npt
def read_csv(filenames: Union[str, List[str]]) -> Tuple[str, npt.NDArray[np.float64]]:
"""
Reads CSV files like those produced by Stan, returning the header and data.
If multiple files are given, the data is stacked along the first axis,
so in typical usage, the shape of the returned data will be
``(n_chains, n_samples, n_params)``.
Parameters
----------
filenames : Union[str, List[str]]
Path to the CSV file(s) to read.
Returns
-------
Tuple[str, npt.NDArray[np.float64]]
The header row and data from the CSV file(s).
Raises
------
ValueError
If multiple files are given and the headers do not match between them.
"""
if not isinstance(filenames, list):
filenames = [filenames]
header = ""
data: List[npt.NDArray[np.float64]] = [None for _ in range(len(filenames))] # type: ignore
for i, f in enumerate(filenames):
with open(f, "r") as fd:
while (file_header := fd.readline()).startswith("#"):
pass
if header == "":
header = file_header
elif header != file_header:
raise ValueError("Headers do not match")
data[i] = np.loadtxt(fd, delimiter=",", comments="#")
return header.strip(), np.stack(data, axis=0)
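
As a quick sanity check, here is a minimal usage sketch against a toy, Stan-style CSV; the temporary file and its contents are invented for illustration, and ``read_csv`` is the function defined above:

import tempfile

# two comment lines, a header row, then two draws
csv_text = "# model = demo\n# seed = 1\nlp__,theta\n-7.5,0.25\n-7.1,0.31\n"
with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False) as tmp:
    tmp.write(csv_text)

header, draws = read_csv(tmp.name)
print(header)       # lp__,theta
print(draws.shape)  # (1, 2, 2): one chain, two draws, two columns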

View File

@@ -0,0 +1,102 @@
"""
Utilities for writing Stan JSON files.
"""

try:
    import ujson as json

    uj_version = tuple(map(int, json.__version__.split(".")))
    if uj_version < (5, 5, 0):
        raise ImportError("ujson version too old")  # pragma: no cover
    UJSON_AVAILABLE = True
except Exception:
    UJSON_AVAILABLE = False
    import json

from typing import Any, Mapping

import numpy as np


def process_dictionary(d: Mapping[str, Any]) -> Mapping[str, Any]:
    return {k: process_value(v) for k, v in d.items()}


# pylint: disable=too-many-return-statements
def process_value(val: Any) -> Any:
    if val is None:
        return None
    if isinstance(val, bool):  # Stan uses 0, 1
        return int(val)
    if isinstance(val, complex):  # treat as a 2-long array
        return [val.real, val.imag]
    if isinstance(val, dict):  # if a tuple was manually specified
        return process_dictionary(val)
    if isinstance(val, tuple):  # otherwise, turn a tuple into a dict keyed 1..n
        return dict(zip(range(1, len(val) + 1), map(process_value, val)))
    if isinstance(val, list):
        return [process_value(i) for i in val]
    original_module = getattr(type(val), "__module__", "")
    if (
        "numpy" in original_module
        or "xarray" in original_module
        or "pandas" in original_module
    ):
        numpy_val = np.asanyarray(val)
        # fast paths for numeric types
        if numpy_val.dtype.kind in "iuf":
            return numpy_val.tolist()
        if numpy_val.dtype.kind == "c":
            return np.stack(
                [np.asarray(numpy_val.real), np.asarray(numpy_val.imag)], axis=-1
            ).tolist()
        if numpy_val.dtype.kind == "b":
            return numpy_val.astype(int).tolist()
        # should only be object arrays (tuples, etc.)
        return process_value(numpy_val.tolist())
    return val


def dump_stan_json(data: Mapping[str, Any]) -> str:
    """
    Convert a mapping of strings to data to a JSON string.

    Values can be any numeric type, a boolean (converted to int),
    or any collection compatible with :func:`numpy.asarray`, e.g. a
    :class:`pandas.Series`.

    Produces a string compatible with the
    `JSON format for CmdStan
    <https://mc-stan.org/docs/cmdstan-guide/json.html>`__.

    :param data: A mapping from strings to values. This can be a dictionary
        or something more exotic like an :class:`xarray.Dataset`. This will be
        copied before type conversion, not modified.
    """
    return json.dumps(process_dictionary(data))


def write_stan_json(path: str, data: Mapping[str, Any]) -> None:
    """
    Dump a mapping of strings to data to a JSON file.

    Values can be any numeric type, a boolean (converted to int),
    or any collection compatible with :func:`numpy.asarray`, e.g. a
    :class:`pandas.Series`.

    Produces a file compatible with the
    `JSON format for CmdStan
    <https://mc-stan.org/docs/cmdstan-guide/json.html>`__.

    :param path: File path for the created JSON file. It will be overwritten
        if it already exists.
    :param data: A mapping from strings to values. This can be a dictionary
        or something more exotic like an :class:`xarray.Dataset`. This will be
        copied before type conversion, not modified.
    """
    with open(path, "w") as fd:
        if UJSON_AVAILABLE:
            json.dump(process_dictionary(data), fd)
        else:
            # stream the encoding in chunks rather than building one large string
            for chunk in json.JSONEncoder().iterencode(process_dictionary(data)):
                fd.write(chunk)
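
A short sketch of the conversions ``process_value`` applies, using the functions defined above; the dictionary is invented for illustration, and the printed output assumes the standard-library ``json`` module:

data = {
    "n": 3,                         # plain ints pass through
    "flag": True,                   # bool -> 1
    "z": 1 + 2j,                    # complex -> [real, imag]
    "theta": np.array([0.1, 0.2]),  # numpy array -> list
    "pair": (4, 5.5),               # tuple -> dict keyed "1", "2", ...
}
print(dump_stan_json(data))
# {"n": 3, "flag": 1, "z": [1.0, 2.0], "theta": [0.1, 0.2], "pair": {"1": 4, "2": 5.5}}

Calling ``write_stan_json("data.json", data)`` would produce the same text on disk.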

View File

@@ -0,0 +1,252 @@
"""
Classes and functions for reshaping Stan output.

Especially with the addition of tuples, Stan writes
flat arrays of data with a rich internal structure.
"""

from dataclasses import dataclass
from enum import Enum
from math import prod
from typing import Any, Dict, Iterable, List, Tuple

import numpy as np
import numpy.typing as npt


class VariableType(Enum):
    SCALAR = 1  # real or integer
    COMPLEX = 2  # complex number - requires striding
    TUPLE = 3  # tuples - require recursive handling


@dataclass
class Variable:
    """
    This class represents a single output variable of a Stan model.

    It contains information about the name, dimensions, and type of the
    variable, as well as the indices of where that variable is located in
    the flattened output array Stan models write.

    Generally, this class should not be instantiated directly, but rather
    created by the :func:`parse_header()` function.
    """

    # name of the parameter as given in Stan. For nested parameters, this is a dummy name
    name: str
    # where to start (resp. end) reading from the flattened array.
    # For arrays with nested parameters, this will be for the first element
    # and is relative to the start of the parent
    start_idx: int
    end_idx: int
    # rectangular dimensions of the parameter (e.g. (2, 3) for a 2x3 matrix)
    # For nested parameters, this will be the dimensions of the outermost array.
    dimensions: Tuple[int, ...]
    # type of the parameter
    type: VariableType
    # list of nested parameters
    contents: List["Variable"]

    def dtype(self, top: bool = True) -> np.dtype:
        if self.type == VariableType.TUPLE:
            # a tuple becomes a structured dtype with fields "1", "2", ...
            elts = [
                (str(i + 1), param.dtype(top=False))
                for i, param in enumerate(self.contents)
            ]
            dtype = np.dtype(elts)
        elif self.type == VariableType.SCALAR:
            dtype = np.dtype(np.float64)
        elif self.type == VariableType.COMPLEX:
            dtype = np.dtype(np.complex128)

        if top:
            return dtype
        else:
            return np.dtype((dtype, self.dimensions))

    def columns(self) -> Iterable[int]:
        return range(self.start_idx, self.end_idx)

    def num_elts(self) -> int:
        return prod(self.dimensions)

    def elt_size(self) -> int:
        # total flattened width of this variable; a single element
        # occupies elt_size() // num_elts() columns
        return self.end_idx - self.start_idx

    def _extract_helper(self, src: np.ndarray, offset: int = 0) -> np.ndarray:
        start = self.start_idx + offset
        end = self.end_idx + offset
        if self.type == VariableType.SCALAR:
            return src[..., start:end].reshape(-1, *self.dimensions, order="F")
        elif self.type == VariableType.COMPLEX:
            ret = src[..., start:end].reshape(-1, 2, *self.dimensions, order="F")
            ret = ret[:, ::2] + 1j * ret[:, 1::2]
            return ret.squeeze().reshape(-1, *self.dimensions, order="F")
        elif self.type == VariableType.TUPLE:
            out: np.ndarray = np.empty(
                (prod(src.shape[:-1]), prod(self.dimensions)), dtype=object
            )
            for idx in range(self.num_elts()):
                # column offset of the idx-th element, relative to this variable's start
                off = idx * self.elt_size() // self.num_elts()
                elts = [
                    param._extract_helper(src, offset=start + off)
                    for param in self.contents
                ]
                for i in range(elts[0].shape[0]):
                    out[i, idx] = tuple(elt[i] for elt in elts)
            return out.reshape(-1, *self.dimensions, order="F")

    def extract_reshape(self, src: np.ndarray, object: bool = True) -> npt.NDArray[Any]:
        """
        Given an array whose final dimension is the flattened output of a
        Stan model (e.g. one row of a Stan CSV file), extract the variable
        and reshape it to the correct type and dimensions.

        This will most likely result in copies of the data being made if
        the variable is not a scalar.

        Parameters
        ----------
        src : np.ndarray
            The array to extract from.
            Indices besides the final dimension are preserved
            in the output.
        object : bool
            If True, the output of tuple types will be an object array,
            otherwise it will use custom dtypes to represent tuples.

        Returns
        -------
        npt.NDArray[Any]
            The extracted variable, reshaped to the correct dimensions.
            If the variable is a tuple, this will be an object array,
            otherwise it will have a dtype of either float64 or complex128.
        """
        out = self._extract_helper(src)
        if not object:
            out = out.astype(self.dtype())
        if src.ndim > 1:
            out = out.reshape(*src.shape[:-1], *self.dimensions, order="F")
        else:
            out = out.squeeze(axis=0)
        return out
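
To illustrate the reshaping, here is a hand-built ``Variable`` for a hypothetical 2x2 matrix ``m`` occupying columns 1 through 4 of each draw; Stan writes matrices in column-major order, which is why the reshapes above use ``order="F"``:

m = Variable(name="m", start_idx=1, end_idx=5, dimensions=(2, 2),
             type=VariableType.SCALAR, contents=[])
draws = np.arange(10.0).reshape(2, 5)  # 2 draws, 5 columns: lp__, m.1.1, m.2.1, m.1.2, m.2.2
m.extract_reshape(draws)[0]
# array([[1., 3.],
#        [2., 4.]])
m.dtype()  # dtype('float64'); tuple variables instead yield structured dtypes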


def _munge_first_tuple(tup: str) -> str:
    # e.g. "x.2:1.real" -> "dummy_1.real": drop the name and outer index,
    # keeping everything after the first ":"
    return "dummy_" + tup.split(":", 1)[1]


def _get_base_name(param: str) -> str:
    # e.g. "bar.1.1" -> "bar", "x.2:1" -> "x"
    return param.split(".")[0].split(":")[0]


def _from_header(header: str) -> List[Variable]:
    # appending __dummy ensures one extra iteration in the later loop
    header = header.strip() + ",__dummy"
    entries = header.split(",")
    params = []
    start_idx = 0
    name = _get_base_name(entries[0])
    for i in range(0, len(entries) - 1):
        entry = entries[i]
        next_name = _get_base_name(entries[i + 1])
        if next_name != name:
            # the variable ends at this column; classify it by its last entry
            if ":" not in entry:
                dims = entry.split(".")[1:]
                if ".real" in entry or ".imag" in entry:
                    type = VariableType.COMPLEX
                    dims = dims[:-1]
                else:
                    type = VariableType.SCALAR
                params.append(
                    Variable(
                        name=name,
                        start_idx=start_idx,
                        end_idx=i + 1,
                        dimensions=tuple(map(int, dims)),
                        type=type,
                        contents=[],
                    )
                )
            else:
                dims = entry.split(":")[0].split(".")[1:]
                munged_header = ",".join(
                    dict.fromkeys(map(_munge_first_tuple, entries[start_idx : i + 1]))
                )
                params.append(
                    Variable(
                        name=name,
                        start_idx=start_idx,
                        end_idx=i + 1,
                        dimensions=tuple(map(int, dims)),
                        type=VariableType.TUPLE,
                        contents=_from_header(munged_header),
                    )
                )
            start_idx = i + 1
            name = next_name
    return params


def parse_header(header: str) -> Dict[str, Variable]:
    """
    Given a comma-separated list of names of Stan outputs, like
    that from the header row of a CSV file, parse it into a dictionary of
    :class:`Variable` objects.

    Parameters
    ----------
    header : str
        Comma-separated list of Stan variables, including index information.
        For example, an ``array[2] real foo`` would be represented as
        ``foo.1,foo.2``.

    Returns
    -------
    Dict[str, Variable]
        A dictionary mapping the base name of each variable to a :class:`Variable`.
    """
    return {param.name: param for param in _from_header(header)}
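
For example, parsing a hypothetical header containing a scalar, a length-2 vector, and a 2x2 matrix:

variables = parse_header("lp__,foo.1,foo.2,bar.1.1,bar.2.1,bar.1.2,bar.2.2")
variables["lp__"].dimensions  # ()
variables["foo"].dimensions   # (2,)
variables["bar"].dimensions   # (2, 2)
variables["bar"].columns()    # range(3, 7)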


def stan_variables(
    parameters: Dict[str, Variable],
    source: npt.NDArray[np.float64],
    *,
    object: bool = True,
) -> Dict[str, npt.NDArray[Any]]:
    """
    Given a dictionary of :class:`Variable` objects and a source array,
    extract the variables from the source array and reshape them to the
    correct dimensions.

    Parameters
    ----------
    parameters : Dict[str, Variable]
        A dictionary of :class:`Variable` objects,
        like that returned by :func:`parse_header()`.
    source : npt.NDArray[np.float64]
        The array to extract from.
    object : bool
        If True, the output of tuple types will be an object array,
        otherwise it will use custom dtypes to represent tuples.

    Returns
    -------
    Dict[str, npt.NDArray[Any]]
        A dictionary mapping the base name of each variable to the extracted
        and reshaped data.
    """
    return {
        param.name: param.extract_reshape(source, object=object)
        for param in parameters.values()
    }
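
Putting the pieces together, a typical end-to-end flow combines the three modules above; the file names and the parameter name ``theta`` are hypothetical placeholders for real CmdStan output:

header, draws = read_csv(["output_1.csv", "output_2.csv"])  # draws: (2, n_samples, n_params)
params = parse_header(header)
sv = stan_variables(params, draws)  # dict of properly-shaped arrays
sv["theta"].shape                   # (2, n_samples) for a scalar theta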