reconnect moved files to git repo
295 venv/lib/python3.11/site-packages/patsy/parse_formula.py Normal file
@@ -0,0 +1,295 @@
# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# This file defines a parser for a simple language based on S/R "formulas"
# (which are described in sections 2.3 and 2.4 in Chambers & Hastie, 1992). It
# uses the machinery in patsy.parse_core to do the heavy-lifting -- its
# biggest job is to handle tokenization.


__all__ = ["parse_formula"]

# The Python tokenizer
import tokenize

from io import StringIO

from patsy import PatsyError
from patsy.origin import Origin
from patsy.infix_parser import Token, Operator, infix_parse, ParseNode
from patsy.tokens import python_tokenize, pretty_untokenize
from patsy.util import PushbackAdapter

_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "NUMBER"]


def _is_a(f, v):
    try:
        f(v)
    except ValueError:
        return False
    else:
        return True
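
# Illustrative note (added annotation, not in the original source): _is_a
# reports whether the converter `f` accepts the string `v`, e.g.
#
#     >>> _is_a(int, "10"), _is_a(float, "1.5"), _is_a(float, "spam")
#     (True, True, False)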


# Helper function for _tokenize_formula:
def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError(
            "unclosed bracket in embedded Python expression", Origin.combine(origins)
        )
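
# Illustrative note (added annotation): given the token stream for
# "foo(a + b) + c" with "+" among the end_tokens, the first "+" is consumed
# at bracket_level == 1 (inside the call), while the second is seen at
# bracket_level == 0 and pushed back, so the returned token is
# Token("PYTHON_EXPR", ..., extra="foo(a + b)").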


def _tokenize_formula(code, operator_strings):
    assert "(" not in operator_strings
    assert ")" not in operator_strings
    magic_token_types = {
        "(": Token.LPAREN,
        ")": Token.RPAREN,
    }
    for operator_string in operator_strings:
        magic_token_types[operator_string] = operator_string
    # Once we enter a Python expression, a ( does not end it, but any other
    # "magic" token does:
    end_tokens = set(magic_token_types)
    end_tokens.remove("(")

    it = PushbackAdapter(python_tokenize(code))
    for pytype, token_string, origin in it:
        if token_string in magic_token_types:
            yield Token(magic_token_types[token_string], origin)
        else:
            it.push_back((pytype, token_string, origin))
            yield _read_python_expr(it, end_tokens)
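
# Illustrative sketch (added annotation): the resulting token types for a
# small formula, mirroring test__tokenize_formula below:
#
#     >>> [tok.type for tok in _tokenize_formula("y ~ a + 1", ["+", "~"])]
#     ['PYTHON_EXPR', '~', 'PYTHON_EXPR', '+', 'ONE']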


def test__tokenize_formula():
    code = "y ~ a + (foo(b,c + 2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    expecteds = [
        ("PYTHON_EXPR", Origin(code, 0, 1), "y"),
        ("~", Origin(code, 2, 3), None),
        ("PYTHON_EXPR", Origin(code, 4, 5), "a"),
        ("+", Origin(code, 6, 7), None),
        (Token.LPAREN, Origin(code, 8, 9), None),
        ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"),
        (Token.RPAREN, Origin(code, 23, 24), None),
        ("+", Origin(code, 25, 26), None),
        ("-", Origin(code, 27, 28), None),
        ("ONE", Origin(code, 28, 29), "1"),
        ("+", Origin(code, 30, 31), None),
        ("ZERO", Origin(code, 32, 33), "0"),
        ("+", Origin(code, 34, 35), None),
        ("NUMBER", Origin(code, 36, 38), "10"),
    ]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == expected[1]
        assert got.extra == expected[2]


_unary_tilde = Operator("~", 1, -100)
_default_ops = [
    _unary_tilde,
    Operator("~", 2, -100),
    Operator("+", 2, 100),
    Operator("-", 2, 100),
    Operator("*", 2, 200),
    Operator("/", 2, 200),
    Operator(":", 2, 300),
    Operator("**", 2, 500),
    Operator("+", 1, 100),
    Operator("-", 1, 100),
]
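
# Illustrative note (added annotation): each entry is
# Operator(token_type, arity, precedence). Higher precedence binds tighter,
# so "a + b:c" groups as a + (b:c) because ":" (300) outranks "+" (100),
# while "~" (-100) binds loosest of all.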


def parse_formula(code, extra_operators=[]):
    if not code.strip():
        code = "~ 1"

    for op in extra_operators:
        if op.precedence < 0:
            raise ValueError("all operators must have precedence >= 0")

    operators = _default_ops + extra_operators
    operator_strings = [op.token_type for op in operators]
    tree = infix_parse(
        _tokenize_formula(code, operator_strings), operators, _atomic_token_types
    )
    if not isinstance(tree, ParseNode) or tree.type != "~":
        tree = ParseNode("~", None, [tree], tree.origin)
    return tree
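
# Illustrative usage sketch (added annotation):
#
#     >>> tree = parse_formula("y ~ x1 + x2")
#     >>> tree.type
#     '~'
#     >>> [arg.type for arg in tree.args]
#     ['PYTHON_EXPR', '+']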


#############

_parser_tests = {
    "": ["~", "1"],
    " ": ["~", "1"],
    " \n ": ["~", "1"],
    "1": ["~", "1"],
    "a": ["~", "a"],
    "a ~ b": ["~", "a", "b"],
    "(a ~ b)": ["~", "a", "b"],
    "a ~ ((((b))))": ["~", "a", "b"],
    "a ~ ((((+b))))": ["~", "a", ["+", "b"]],
    "a + b + c": ["~", ["+", ["+", "a", "b"], "c"]],
    "a + (b ~ c) + d": ["~", ["+", ["+", "a", ["~", "b", "c"]], "d"]],
    "a + np.log(a, base=10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Note different spacing:
    "a + np . log(a , base = 10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Check precedence
    "a + b ~ c * d": ["~", ["+", "a", "b"], ["*", "c", "d"]],
    "a + b * c": ["~", ["+", "a", ["*", "b", "c"]]],
    "-a**2": ["~", ["-", ["**", "a", "2"]]],
    "-a:b": ["~", ["-", [":", "a", "b"]]],
    "a + b:c": ["~", ["+", "a", [":", "b", "c"]]],
    "(a + b):c": ["~", [":", ["+", "a", "b"], "c"]],
    "a*b:c": ["~", ["*", "a", [":", "b", "c"]]],
    "a+b / c": ["~", ["+", "a", ["/", "b", "c"]]],
    "~ a": ["~", "a"],
    "-1": ["~", ["-", "1"]],
}
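
# Illustrative note (added annotation): expected parses are written as nested
# lists [op_type, *args]; leaf nodes are given by their pretty-printed
# expression text, which _compare_trees below checks against node.token.extra.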


def _compare_trees(got, expected):
    assert isinstance(got, ParseNode)
    if got.args:
        assert got.type == expected[0]
        for arg, expected_arg in zip(got.args, expected[1:]):
            _compare_trees(arg, expected_arg)
    else:
        assert got.type in _atomic_token_types
        assert got.token.extra == expected
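
# Illustrative usage sketch (added annotation): passes silently on a match,
# raises AssertionError otherwise:
#
#     >>> _compare_trees(parse_formula("a + b"), ["~", ["+", "a", "b"]])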


def _do_parse_test(test_cases, extra_operators):
    for code, expected in test_cases.items():
        actual = parse_formula(code, extra_operators=extra_operators)
        print(repr(code), repr(expected))
        print(actual)
        _compare_trees(actual, expected)


def test_parse_formula():
    _do_parse_test(_parser_tests, [])


def test_parse_origin():
    tree = parse_formula("a ~ b + c")
    assert tree.origin == Origin("a ~ b + c", 0, 9)
    assert tree.token.origin == Origin("a ~ b + c", 2, 3)
    assert tree.args[0].origin == Origin("a ~ b + c", 0, 1)
    assert tree.args[1].origin == Origin("a ~ b + c", 4, 9)
    assert tree.args[1].token.origin == Origin("a ~ b + c", 6, 7)
    assert tree.args[1].args[0].origin == Origin("a ~ b + c", 4, 5)
    assert tree.args[1].args[1].origin == Origin("a ~ b + c", 8, 9)
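
# Illustrative note (added annotation): the origins above are half-open
# [start, end) offsets into the code string:
#
#     a ~ b + c
#     0123456789
#
# so "a" spans [0, 1), "~" spans [2, 3), and the whole right-hand side
# "b + c" spans [4, 9).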


# <> mark off where the error should be reported:
_parser_error_tests = [
    "a <+>",
    "a + <(>",
    "a + b <# asdf>",
    "<)>",
    "a + <)>",
    "<*> a",
    "a + <*>",
    "a + <foo[bar>",
    "a + <foo{bar>",
    "a + <foo(bar>",
    "a + <[bar>",
    "a + <{bar>",
    "a + <{bar[]>",
    "a + foo<]>bar",
    "a + foo[]<]>bar",
    "a + foo{}<}>bar",
    "a + foo<)>bar",
    "a + b<)>",
    "(a) <.>",
    "<(>a + b",
    "a +< >'foo",  # Not the best placement for the error
]


# Split out so it can also be used by tests of the evaluator (which also
# raises PatsyErrors)
def _parsing_error_test(parse_fn, error_descs):  # pragma: no cover
    for error_desc in error_descs:
        letters = []
        start = None
        end = None
        for letter in error_desc:
            if letter == "<":
                start = len(letters)
            elif letter == ">":
                end = len(letters)
            else:
                letters.append(letter)
        bad_code = "".join(letters)
        assert start is not None and end is not None
        print(error_desc)
        print(repr(bad_code), start, end)
        try:
            parse_fn(bad_code)
        except PatsyError as e:
            print(e)
            assert e.origin.code == bad_code
            assert e.origin.start in (0, start)
            assert e.origin.end in (end, len(bad_code))
        else:
            assert False, "parser failed to report an error!"
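
# Worked example (added annotation): for the entry "a + <)>", bad_code
# becomes "a + )" with start == 4 and end == 5, so the PatsyError must point
# at the stray ")" (the checks also accept an origin covering the whole
# string).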


def test_parse_errors(extra_operators=[]):
    def parse_fn(code):
        return parse_formula(code, extra_operators=extra_operators)

    _parsing_error_test(parse_fn, _parser_error_tests)


_extra_op_parser_tests = {
    "a | b": ["~", ["|", "a", "b"]],
    "a * b|c": ["~", ["*", "a", ["|", "b", "c"]]],
}


def test_parse_extra_op():
    extra_operators = [Operator("|", 2, 250)]
    _do_parse_test(_parser_tests, extra_operators=extra_operators)
    _do_parse_test(_extra_op_parser_tests, extra_operators=extra_operators)
    test_parse_errors(extra_operators=extra_operators)
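
# Illustrative note (added annotation): "|" is registered here with
# precedence 250, between "*" (200) and ":" (300), which is why "a * b|c"
# groups as ["*", "a", ["|", "b", "c"]] in the expected parses above.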