some new features

ilgazca
2025-07-30 18:53:50 +03:00
parent 8019bd3b7c
commit 079804a0fc
2118 changed files with 297840 additions and 502 deletions

View File

@@ -0,0 +1,14 @@
from .csv import read_csv
from .json import dump_stan_json, write_stan_json
from .reshape import Variable, parse_header, stan_variables

__all__ = [
    "read_csv",
    "write_stan_json",
    "dump_stan_json",
    "Variable",
    "parse_header",
    "stan_variables",
]

__version__ = "0.5.1"

View File

@@ -0,0 +1,50 @@
"""
Module to load the minimal information from a Stan CSV file.
Only the header row and data are read, no metadata is parsed.
"""
from typing import List, Tuple, Union
import numpy as np
import numpy.typing as npt
def read_csv(filenames: Union[str, List[str]]) -> Tuple[str, npt.NDArray[np.float64]]:
"""
Reads CSV files like those produced by Stan, returning the header and data.
If multiple files are given, the data is stacked along the first axis,
so in typical usage, the shape of the returned data will be
``(n_chains, n_samples, n_params)``.
Parameters
----------
filenames : Union[str, List[str]]
Path to the CSV file(s) to read.
Returns
-------
Tuple[str, npt.NDArray[np.float64]]
The header row and data from the CSV file(s).
Raises
------
ValueError
If multiple files are given and the headers do not match between them.
"""
if not isinstance(filenames, list):
filenames = [filenames]
header = ""
data: List[npt.NDArray[np.float64]] = [None for _ in range(len(filenames))] # type: ignore
for i, f in enumerate(filenames):
with open(f, "r") as fd:
while (file_header := fd.readline()).startswith("#"):
pass
if header == "":
header = file_header
elif header != file_header:
raise ValueError("Headers do not match")
data[i] = np.loadtxt(fd, delimiter=",", comments="#")
return header.strip(), np.stack(data, axis=0)
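
As a quick sanity check, here is a minimal usage sketch against a toy, Stan-style CSV; the temporary file and its contents are invented for illustration, and ``read_csv`` is the function defined above:

import tempfile

# two comment lines, a header row, then two draws
csv_text = "# model = demo\n# seed = 1\nlp__,theta\n-7.5,0.25\n-7.1,0.31\n"
with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False) as tmp:
    tmp.write(csv_text)

header, draws = read_csv(tmp.name)
print(header)       # lp__,theta
print(draws.shape)  # (1, 2, 2): one chain, two draws, two columns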

View File

@@ -0,0 +1,102 @@
"""
Utilities for writing Stan JSON files.
"""

try:
    import ujson as json

    uj_version = tuple(map(int, json.__version__.split(".")))
    if uj_version < (5, 5, 0):
        raise ImportError("ujson version too old")  # pragma: no cover
    UJSON_AVAILABLE = True
except Exception:
    UJSON_AVAILABLE = False
    import json

from typing import Any, Mapping

import numpy as np


def process_dictionary(d: Mapping[str, Any]) -> Mapping[str, Any]:
    return {k: process_value(v) for k, v in d.items()}


# pylint: disable=too-many-return-statements
def process_value(val: Any) -> Any:
    if val is None:
        return None
    if isinstance(val, bool):  # Stan uses 0, 1
        return int(val)
    if isinstance(val, complex):  # treat as a 2-long array
        return [val.real, val.imag]
    if isinstance(val, dict):  # if a tuple was manually specified
        return process_dictionary(val)
    if isinstance(val, tuple):  # otherwise, turn a tuple into a dict keyed 1..n
        return dict(zip(range(1, len(val) + 1), map(process_value, val)))
    if isinstance(val, list):
        return [process_value(i) for i in val]
    original_module = getattr(type(val), "__module__", "")
    if (
        "numpy" in original_module
        or "xarray" in original_module
        or "pandas" in original_module
    ):
        numpy_val = np.asanyarray(val)
        # fast paths for numeric types
        if numpy_val.dtype.kind in "iuf":
            return numpy_val.tolist()
        if numpy_val.dtype.kind == "c":
            return np.stack(
                [np.asarray(numpy_val.real), np.asarray(numpy_val.imag)], axis=-1
            ).tolist()
        if numpy_val.dtype.kind == "b":
            return numpy_val.astype(int).tolist()
        # should only be object arrays (tuples, etc.)
        return process_value(numpy_val.tolist())
    return val


def dump_stan_json(data: Mapping[str, Any]) -> str:
    """
    Convert a mapping of strings to data to a JSON string.

    Values can be any numeric type, a boolean (converted to int),
    or any collection compatible with :func:`numpy.asarray`, e.g. a
    :class:`pandas.Series`.

    Produces a string compatible with the
    `JSON format for CmdStan
    <https://mc-stan.org/docs/cmdstan-guide/json.html>`__.

    :param data: A mapping from strings to values. This can be a dictionary
        or something more exotic like an :class:`xarray.Dataset`. This will be
        copied before type conversion, not modified.
    """
    return json.dumps(process_dictionary(data))


def write_stan_json(path: str, data: Mapping[str, Any]) -> None:
    """
    Dump a mapping of strings to data to a JSON file.

    Values can be any numeric type, a boolean (converted to int),
    or any collection compatible with :func:`numpy.asarray`, e.g. a
    :class:`pandas.Series`.

    Produces a file compatible with the
    `JSON format for CmdStan
    <https://mc-stan.org/docs/cmdstan-guide/json.html>`__.

    :param path: File path for the created JSON file. It will be overwritten
        if it already exists.
    :param data: A mapping from strings to values. This can be a dictionary
        or something more exotic like an :class:`xarray.Dataset`. This will be
        copied before type conversion, not modified.
    """
    with open(path, "w") as fd:
        if UJSON_AVAILABLE:
            json.dump(process_dictionary(data), fd)
        else:
            # stream the encoding in chunks rather than building one large string
            for chunk in json.JSONEncoder().iterencode(process_dictionary(data)):
                fd.write(chunk)
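
A short sketch of the conversions ``process_value`` applies, using the functions defined above; the dictionary is invented for illustration, and the printed output assumes the standard-library ``json`` module:

data = {
    "n": 3,                         # plain ints pass through
    "flag": True,                   # bool -> 1
    "z": 1 + 2j,                    # complex -> [real, imag]
    "theta": np.array([0.1, 0.2]),  # numpy array -> list
    "pair": (4, 5.5),               # tuple -> dict keyed "1", "2", ...
}
print(dump_stan_json(data))
# {"n": 3, "flag": 1, "z": [1.0, 2.0], "theta": [0.1, 0.2], "pair": {"1": 4, "2": 5.5}}

Calling ``write_stan_json("data.json", data)`` would produce the same text on disk.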

View File

@@ -0,0 +1,252 @@
"""
Classes and functions for reshaping Stan output.

Especially with the addition of tuples, Stan writes
flat arrays of data with a rich internal structure.
"""

from dataclasses import dataclass
from enum import Enum
from math import prod
from typing import Any, Dict, Iterable, List, Tuple

import numpy as np
import numpy.typing as npt


class VariableType(Enum):
    SCALAR = 1  # real or integer
    COMPLEX = 2  # complex number - requires striding
    TUPLE = 3  # tuples - require recursive handling


@dataclass
class Variable:
    """
    This class represents a single output variable of a Stan model.

    It contains information about the name, dimensions, and type of the
    variable, as well as the indices of where that variable is located in
    the flattened output array Stan models write.

    Generally, this class should not be instantiated directly, but rather
    created by the :func:`parse_header()` function.
    """

    # name of the parameter as given in Stan. For nested parameters, this is a dummy name
    name: str
    # where to start (resp. end) reading from the flattened array.
    # For arrays with nested parameters, this will be for the first element
    # and is relative to the start of the parent
    start_idx: int
    end_idx: int
    # rectangular dimensions of the parameter (e.g. (2, 3) for a 2x3 matrix)
    # For nested parameters, this will be the dimensions of the outermost array.
    dimensions: Tuple[int, ...]
    # type of the parameter
    type: VariableType
    # list of nested parameters
    contents: List["Variable"]

    def dtype(self, top: bool = True) -> np.dtype:
        if self.type == VariableType.TUPLE:
            # a tuple becomes a structured dtype with fields "1", "2", ...
            elts = [
                (str(i + 1), param.dtype(top=False))
                for i, param in enumerate(self.contents)
            ]
            dtype = np.dtype(elts)
        elif self.type == VariableType.SCALAR:
            dtype = np.dtype(np.float64)
        elif self.type == VariableType.COMPLEX:
            dtype = np.dtype(np.complex128)

        if top:
            return dtype
        else:
            return np.dtype((dtype, self.dimensions))

    def columns(self) -> Iterable[int]:
        return range(self.start_idx, self.end_idx)

    def num_elts(self) -> int:
        return prod(self.dimensions)

    def elt_size(self) -> int:
        # total flattened width of this variable; a single element
        # occupies elt_size() // num_elts() columns
        return self.end_idx - self.start_idx

    def _extract_helper(self, src: np.ndarray, offset: int = 0) -> np.ndarray:
        start = self.start_idx + offset
        end = self.end_idx + offset
        if self.type == VariableType.SCALAR:
            return src[..., start:end].reshape(-1, *self.dimensions, order="F")
        elif self.type == VariableType.COMPLEX:
            ret = src[..., start:end].reshape(-1, 2, *self.dimensions, order="F")
            ret = ret[:, ::2] + 1j * ret[:, 1::2]
            return ret.squeeze().reshape(-1, *self.dimensions, order="F")
        elif self.type == VariableType.TUPLE:
            out: np.ndarray = np.empty(
                (prod(src.shape[:-1]), prod(self.dimensions)), dtype=object
            )
            for idx in range(self.num_elts()):
                # column offset of the idx-th element, relative to this variable's start
                off = idx * self.elt_size() // self.num_elts()
                elts = [
                    param._extract_helper(src, offset=start + off)
                    for param in self.contents
                ]
                for i in range(elts[0].shape[0]):
                    out[i, idx] = tuple(elt[i] for elt in elts)
            return out.reshape(-1, *self.dimensions, order="F")

    def extract_reshape(self, src: np.ndarray, object: bool = True) -> npt.NDArray[Any]:
        """
        Given an array whose final dimension is the flattened output of a
        Stan model (e.g. one row of a Stan CSV file), extract the variable
        and reshape it to the correct type and dimensions.

        This will most likely result in copies of the data being made if
        the variable is not a scalar.

        Parameters
        ----------
        src : np.ndarray
            The array to extract from.
            Indices besides the final dimension are preserved
            in the output.
        object : bool
            If True, the output of tuple types will be an object array,
            otherwise it will use custom dtypes to represent tuples.

        Returns
        -------
        npt.NDArray[Any]
            The extracted variable, reshaped to the correct dimensions.
            If the variable is a tuple, this will be an object array,
            otherwise it will have a dtype of either float64 or complex128.
        """
        out = self._extract_helper(src)
        if not object:
            out = out.astype(self.dtype())
        if src.ndim > 1:
            out = out.reshape(*src.shape[:-1], *self.dimensions, order="F")
        else:
            out = out.squeeze(axis=0)
        return out
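
To illustrate the reshaping, here is a hand-built ``Variable`` for a hypothetical 2x2 matrix ``m`` occupying columns 1 through 4 of each draw; Stan writes matrices in column-major order, which is why the reshapes above use ``order="F"``:

m = Variable(name="m", start_idx=1, end_idx=5, dimensions=(2, 2),
             type=VariableType.SCALAR, contents=[])
draws = np.arange(10.0).reshape(2, 5)  # 2 draws, 5 columns: lp__, m.1.1, m.2.1, m.1.2, m.2.2
m.extract_reshape(draws)[0]
# array([[1., 3.],
#        [2., 4.]])
m.dtype()  # dtype('float64'); tuple variables instead yield structured dtypes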


def _munge_first_tuple(tup: str) -> str:
    # e.g. "x.2:1.real" -> "dummy_1.real": drop the name and outer index,
    # keeping everything after the first ":"
    return "dummy_" + tup.split(":", 1)[1]


def _get_base_name(param: str) -> str:
    # e.g. "bar.1.1" -> "bar", "x.2:1" -> "x"
    return param.split(".")[0].split(":")[0]


def _from_header(header: str) -> List[Variable]:
    # appending __dummy ensures one extra iteration in the later loop
    header = header.strip() + ",__dummy"
    entries = header.split(",")
    params = []
    start_idx = 0
    name = _get_base_name(entries[0])
    for i in range(0, len(entries) - 1):
        entry = entries[i]
        next_name = _get_base_name(entries[i + 1])
        if next_name != name:
            # the variable ends at this column; classify it by its last entry
            if ":" not in entry:
                dims = entry.split(".")[1:]
                if ".real" in entry or ".imag" in entry:
                    type = VariableType.COMPLEX
                    dims = dims[:-1]
                else:
                    type = VariableType.SCALAR
                params.append(
                    Variable(
                        name=name,
                        start_idx=start_idx,
                        end_idx=i + 1,
                        dimensions=tuple(map(int, dims)),
                        type=type,
                        contents=[],
                    )
                )
            else:
                dims = entry.split(":")[0].split(".")[1:]
                munged_header = ",".join(
                    dict.fromkeys(map(_munge_first_tuple, entries[start_idx : i + 1]))
                )
                params.append(
                    Variable(
                        name=name,
                        start_idx=start_idx,
                        end_idx=i + 1,
                        dimensions=tuple(map(int, dims)),
                        type=VariableType.TUPLE,
                        contents=_from_header(munged_header),
                    )
                )
            start_idx = i + 1
            name = next_name
    return params


def parse_header(header: str) -> Dict[str, Variable]:
    """
    Given a comma-separated list of names of Stan outputs, like
    that from the header row of a CSV file, parse it into a dictionary of
    :class:`Variable` objects.

    Parameters
    ----------
    header : str
        Comma-separated list of Stan variables, including index information.
        For example, an ``array[2] real foo`` would be represented as
        ``foo.1,foo.2``.

    Returns
    -------
    Dict[str, Variable]
        A dictionary mapping the base name of each variable to a :class:`Variable`.
    """
    return {param.name: param for param in _from_header(header)}
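
For example, parsing a hypothetical header containing a scalar, a length-2 vector, and a 2x2 matrix:

variables = parse_header("lp__,foo.1,foo.2,bar.1.1,bar.2.1,bar.1.2,bar.2.2")
variables["lp__"].dimensions  # ()
variables["foo"].dimensions   # (2,)
variables["bar"].dimensions   # (2, 2)
variables["bar"].columns()    # range(3, 7)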


def stan_variables(
    parameters: Dict[str, Variable],
    source: npt.NDArray[np.float64],
    *,
    object: bool = True,
) -> Dict[str, npt.NDArray[Any]]:
    """
    Given a dictionary of :class:`Variable` objects and a source array,
    extract the variables from the source array and reshape them to the
    correct dimensions.

    Parameters
    ----------
    parameters : Dict[str, Variable]
        A dictionary of :class:`Variable` objects,
        like that returned by :func:`parse_header()`.
    source : npt.NDArray[np.float64]
        The array to extract from.
    object : bool
        If True, the output of tuple types will be an object array,
        otherwise it will use custom dtypes to represent tuples.

    Returns
    -------
    Dict[str, npt.NDArray[Any]]
        A dictionary mapping the base name of each variable to the extracted
        and reshaped data.
    """
    return {
        param.name: param.extract_reshape(source, object=object)
        for param in parameters.values()
    }
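
Putting the pieces together, a typical end-to-end flow combines the three modules above; the file names and the parameter name ``theta`` are hypothetical placeholders for real CmdStan output:

header, draws = read_csv(["output_1.csv", "output_2.csv"])  # draws: (2, n_samples, n_params)
params = parse_header(header)
sv = stan_variables(params, draws)  # dict of properly-shaped arrays
sv["theta"].shape                   # (2, n_samples) for a scalar theta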