reconnect moved files to git repo
295 venv/lib/python3.11/site-packages/patsy/parse_formula.py Normal file
@@ -0,0 +1,295 @@
# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# This file defines a parser for a simple language based on S/R "formulas"
# (which are described in sections 2.3 and 2.4 in Chambers & Hastie, 1992). It
# uses the machinery in patsy.parse_core to do the heavy-lifting -- its
# biggest job is to handle tokenization.


__all__ = ["parse_formula"]

# The Python tokenizer
import tokenize

from io import StringIO

from patsy import PatsyError
from patsy.origin import Origin
from patsy.infix_parser import Token, Operator, infix_parse, ParseNode
from patsy.tokens import python_tokenize, pretty_untokenize
from patsy.util import PushbackAdapter

_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "NUMBER"]


def _is_a(f, v):
    try:
        f(v)
    except ValueError:
        return False
    else:
        return True
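
# Illustrative note (added annotation, not in the original source): _is_a
# reports whether the converter `f` accepts the string `v`, e.g.
#
#     >>> _is_a(int, "10"), _is_a(float, "1.5"), _is_a(float, "spam")
#     (True, True, False)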


# Helper function for _tokenize_formula:
def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError(
            "unclosed bracket in embedded Python expression", Origin.combine(origins)
        )
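
# Illustrative note (added annotation): given the token stream for
# "foo(a + b) + c" with "+" among the end_tokens, the first "+" is consumed
# at bracket_level == 1 (inside the call), while the second is seen at
# bracket_level == 0 and pushed back, so the returned token is
# Token("PYTHON_EXPR", ..., extra="foo(a + b)").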


def _tokenize_formula(code, operator_strings):
    assert "(" not in operator_strings
    assert ")" not in operator_strings
    magic_token_types = {
        "(": Token.LPAREN,
        ")": Token.RPAREN,
    }
    for operator_string in operator_strings:
        magic_token_types[operator_string] = operator_string
    # Once we enter a Python expression, a ( does not end it, but any other
    # "magic" token does:
    end_tokens = set(magic_token_types)
    end_tokens.remove("(")

    it = PushbackAdapter(python_tokenize(code))
    for pytype, token_string, origin in it:
        if token_string in magic_token_types:
            yield Token(magic_token_types[token_string], origin)
        else:
            it.push_back((pytype, token_string, origin))
            yield _read_python_expr(it, end_tokens)
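
# Illustrative sketch (added annotation): the resulting token types for a
# small formula, mirroring test__tokenize_formula below:
#
#     >>> [tok.type for tok in _tokenize_formula("y ~ a + 1", ["+", "~"])]
#     ['PYTHON_EXPR', '~', 'PYTHON_EXPR', '+', 'ONE']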


def test__tokenize_formula():
    code = "y ~ a + (foo(b,c + 2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    expecteds = [
        ("PYTHON_EXPR", Origin(code, 0, 1), "y"),
        ("~", Origin(code, 2, 3), None),
        ("PYTHON_EXPR", Origin(code, 4, 5), "a"),
        ("+", Origin(code, 6, 7), None),
        (Token.LPAREN, Origin(code, 8, 9), None),
        ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"),
        (Token.RPAREN, Origin(code, 23, 24), None),
        ("+", Origin(code, 25, 26), None),
        ("-", Origin(code, 27, 28), None),
        ("ONE", Origin(code, 28, 29), "1"),
        ("+", Origin(code, 30, 31), None),
        ("ZERO", Origin(code, 32, 33), "0"),
        ("+", Origin(code, 34, 35), None),
        ("NUMBER", Origin(code, 36, 38), "10"),
    ]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == expected[1]
        assert got.extra == expected[2]


_unary_tilde = Operator("~", 1, -100)
_default_ops = [
    _unary_tilde,
    Operator("~", 2, -100),
    Operator("+", 2, 100),
    Operator("-", 2, 100),
    Operator("*", 2, 200),
    Operator("/", 2, 200),
    Operator(":", 2, 300),
    Operator("**", 2, 500),
    Operator("+", 1, 100),
    Operator("-", 1, 100),
]
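
# Illustrative note (added annotation): each entry is
# Operator(token_type, arity, precedence). Higher precedence binds tighter,
# so "a + b:c" groups as a + (b:c) because ":" (300) outranks "+" (100),
# while "~" (-100) binds loosest of all.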


def parse_formula(code, extra_operators=[]):
    if not code.strip():
        code = "~ 1"

    for op in extra_operators:
        if op.precedence < 0:
            raise ValueError("all operators must have precedence >= 0")

    operators = _default_ops + extra_operators
    operator_strings = [op.token_type for op in operators]
    tree = infix_parse(
        _tokenize_formula(code, operator_strings), operators, _atomic_token_types
    )
    if not isinstance(tree, ParseNode) or tree.type != "~":
        tree = ParseNode("~", None, [tree], tree.origin)
    return tree
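
# Illustrative usage sketch (added annotation):
#
#     >>> tree = parse_formula("y ~ x1 + x2")
#     >>> tree.type
#     '~'
#     >>> [arg.type for arg in tree.args]
#     ['PYTHON_EXPR', '+']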


#############

_parser_tests = {
    "": ["~", "1"],
    " ": ["~", "1"],
    " \n ": ["~", "1"],
    "1": ["~", "1"],
    "a": ["~", "a"],
    "a ~ b": ["~", "a", "b"],
    "(a ~ b)": ["~", "a", "b"],
    "a ~ ((((b))))": ["~", "a", "b"],
    "a ~ ((((+b))))": ["~", "a", ["+", "b"]],
    "a + b + c": ["~", ["+", ["+", "a", "b"], "c"]],
    "a + (b ~ c) + d": ["~", ["+", ["+", "a", ["~", "b", "c"]], "d"]],
    "a + np.log(a, base=10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Note different spacing:
    "a + np . log(a , base = 10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Check precedence
    "a + b ~ c * d": ["~", ["+", "a", "b"], ["*", "c", "d"]],
    "a + b * c": ["~", ["+", "a", ["*", "b", "c"]]],
    "-a**2": ["~", ["-", ["**", "a", "2"]]],
    "-a:b": ["~", ["-", [":", "a", "b"]]],
    "a + b:c": ["~", ["+", "a", [":", "b", "c"]]],
    "(a + b):c": ["~", [":", ["+", "a", "b"], "c"]],
    "a*b:c": ["~", ["*", "a", [":", "b", "c"]]],
    "a+b / c": ["~", ["+", "a", ["/", "b", "c"]]],
    "~ a": ["~", "a"],
    "-1": ["~", ["-", "1"]],
}
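
# Illustrative note (added annotation): expected parses are written as nested
# lists [op_type, *args]; leaf nodes are given by their pretty-printed
# expression text, which _compare_trees below checks against node.token.extra.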


def _compare_trees(got, expected):
    assert isinstance(got, ParseNode)
    if got.args:
        assert got.type == expected[0]
        for arg, expected_arg in zip(got.args, expected[1:]):
            _compare_trees(arg, expected_arg)
    else:
        assert got.type in _atomic_token_types
        assert got.token.extra == expected
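
# Illustrative usage sketch (added annotation): passes silently on a match,
# raises AssertionError otherwise:
#
#     >>> _compare_trees(parse_formula("a + b"), ["~", ["+", "a", "b"]])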


def _do_parse_test(test_cases, extra_operators):
    for code, expected in test_cases.items():
        actual = parse_formula(code, extra_operators=extra_operators)
        print(repr(code), repr(expected))
        print(actual)
        _compare_trees(actual, expected)


def test_parse_formula():
    _do_parse_test(_parser_tests, [])


def test_parse_origin():
    tree = parse_formula("a ~ b + c")
    assert tree.origin == Origin("a ~ b + c", 0, 9)
    assert tree.token.origin == Origin("a ~ b + c", 2, 3)
    assert tree.args[0].origin == Origin("a ~ b + c", 0, 1)
    assert tree.args[1].origin == Origin("a ~ b + c", 4, 9)
    assert tree.args[1].token.origin == Origin("a ~ b + c", 6, 7)
    assert tree.args[1].args[0].origin == Origin("a ~ b + c", 4, 5)
    assert tree.args[1].args[1].origin == Origin("a ~ b + c", 8, 9)
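
# Illustrative note (added annotation): the origins above are half-open
# [start, end) offsets into the code string:
#
#     a ~ b + c
#     0123456789
#
# so "a" spans [0, 1), "~" spans [2, 3), and the whole right-hand side
# "b + c" spans [4, 9).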


# <> mark off where the error should be reported:
_parser_error_tests = [
    "a <+>",
    "a + <(>",
    "a + b <# asdf>",
    "<)>",
    "a + <)>",
    "<*> a",
    "a + <*>",
    "a + <foo[bar>",
    "a + <foo{bar>",
    "a + <foo(bar>",
    "a + <[bar>",
    "a + <{bar>",
    "a + <{bar[]>",
    "a + foo<]>bar",
    "a + foo[]<]>bar",
    "a + foo{}<}>bar",
    "a + foo<)>bar",
    "a + b<)>",
    "(a) <.>",
    "<(>a + b",
    "a +< >'foo",  # Not the best placement for the error
]


# Split out so it can also be used by tests of the evaluator (which also
# raises PatsyErrors)
def _parsing_error_test(parse_fn, error_descs):  # pragma: no cover
    for error_desc in error_descs:
        letters = []
        start = None
        end = None
        for letter in error_desc:
            if letter == "<":
                start = len(letters)
            elif letter == ">":
                end = len(letters)
            else:
                letters.append(letter)
        bad_code = "".join(letters)
        assert start is not None and end is not None
        print(error_desc)
        print(repr(bad_code), start, end)
        try:
            parse_fn(bad_code)
        except PatsyError as e:
            print(e)
            assert e.origin.code == bad_code
            assert e.origin.start in (0, start)
            assert e.origin.end in (end, len(bad_code))
        else:
            assert False, "parser failed to report an error!"
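
# Worked example (added annotation): for the entry "a + <)>", bad_code
# becomes "a + )" with start == 4 and end == 5, so the PatsyError must point
# at the stray ")" (the checks also accept an origin covering the whole
# string).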


def test_parse_errors(extra_operators=[]):
    def parse_fn(code):
        return parse_formula(code, extra_operators=extra_operators)

    _parsing_error_test(parse_fn, _parser_error_tests)


_extra_op_parser_tests = {
    "a | b": ["~", ["|", "a", "b"]],
    "a * b|c": ["~", ["*", "a", ["|", "b", "c"]]],
}


def test_parse_extra_op():
    extra_operators = [Operator("|", 2, 250)]
    _do_parse_test(_parser_tests, extra_operators=extra_operators)
    _do_parse_test(_extra_op_parser_tests, extra_operators=extra_operators)
    test_parse_errors(extra_operators=extra_operators)
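
# Illustrative note (added annotation): "|" is registered here with
# precedence 250, between "*" (200) and ":" (300), which is why "a * b|c"
# groups as ["*", "a", ["|", "b", "c"]] in the expected parses above.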