some new features
This commit is contained in:
359
.venv/lib/python3.12/site-packages/Cython/Plex/Scanners.py
Normal file
359
.venv/lib/python3.12/site-packages/Cython/Plex/Scanners.py
Normal file
@ -0,0 +1,359 @@
|
||||
# cython: language_level=3str
|
||||
# cython: auto_pickle=False
|
||||
"""
|
||||
Python Lexical Analyser
|
||||
|
||||
Scanning an input stream
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
|
||||
import cython
|
||||
|
||||
cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object) # noqa:E402
|
||||
|
||||
from . import Errors
|
||||
from .Regexps import BOL, EOL, EOF
|
||||
|
||||
NOT_FOUND = object()
|
||||
|
||||
|
||||
class Scanner(object):
|
||||
"""
|
||||
A Scanner is used to read tokens from a stream of characters
|
||||
using the token set specified by a Plex.Lexicon.
|
||||
|
||||
Constructor:
|
||||
|
||||
Scanner(lexicon, stream, name = '')
|
||||
|
||||
See the docstring of the __init__ method for details.
|
||||
|
||||
Methods:
|
||||
|
||||
See the docstrings of the individual methods for more
|
||||
information.
|
||||
|
||||
read() --> (value, text)
|
||||
Reads the next lexical token from the stream.
|
||||
|
||||
position() --> (name, line, col)
|
||||
Returns the position of the last token read using the
|
||||
read() method.
|
||||
|
||||
begin(state_name)
|
||||
Causes scanner to change state.
|
||||
|
||||
produce(value [, text])
|
||||
Causes return of a token value to the caller of the
|
||||
Scanner.
|
||||
|
||||
"""
|
||||
|
||||
# lexicon = None # Lexicon
|
||||
# stream = None # file-like object
|
||||
# name = ''
|
||||
# buffer = ''
|
||||
#
|
||||
# These positions are used by the scanner to track its internal state:
|
||||
# buf_start_pos = 0 # position in input of start of buffer
|
||||
# next_pos = 0 # position in input of next char to read
|
||||
# cur_pos = 0 # position in input of current char
|
||||
# cur_line = 1 # line number of current char
|
||||
# cur_line_start = 0 # position in input of start of current line
|
||||
# start_pos = 0 # position in input of start of token
|
||||
# current_scanner_position_tuple = ("", 0, 0)
|
||||
# tuple of filename, line number and position in line, really mainly for error reporting
|
||||
#
|
||||
# These positions are used to track what was read from the queue
|
||||
# (which may differ from the internal state when tokens are replaced onto the queue)
|
||||
# last_token_position_tuple = ("", 0, 0) # tuple of filename, line number and position in line
|
||||
|
||||
# text = None # text of last token read
|
||||
# initial_state = None # Node
|
||||
# state_name = '' # Name of initial state
|
||||
# queue = None # list of tokens and positions to be returned
|
||||
# trace = 0
|
||||
|
||||
def __init__(self, lexicon, stream, name='', initial_pos=None):
|
||||
"""
|
||||
Scanner(lexicon, stream, name = '')
|
||||
|
||||
|lexicon| is a Plex.Lexicon instance specifying the lexical tokens
|
||||
to be recognised.
|
||||
|
||||
|stream| can be a file object or anything which implements a
|
||||
compatible read() method.
|
||||
|
||||
|name| is optional, and may be the name of the file being
|
||||
scanned or any other identifying string.
|
||||
"""
|
||||
self.trace = 0
|
||||
|
||||
self.buffer = u''
|
||||
self.buf_start_pos = 0
|
||||
self.next_pos = 0
|
||||
self.cur_pos = 0
|
||||
self.cur_line = 1
|
||||
self.start_pos = 0
|
||||
self.current_scanner_position_tuple = ("", 0, 0)
|
||||
self.last_token_position_tuple = ("", 0, 0)
|
||||
self.text = None
|
||||
self.state_name = None
|
||||
|
||||
self.lexicon = lexicon
|
||||
self.stream = stream
|
||||
self.name = name
|
||||
self.queue = []
|
||||
self.initial_state = None
|
||||
self.begin('')
|
||||
self.next_pos = 0
|
||||
self.cur_pos = 0
|
||||
self.cur_line_start = 0
|
||||
self.cur_char = BOL
|
||||
self.input_state = 1
|
||||
if initial_pos is not None:
|
||||
self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]
|
||||
|
||||
def read(self):
|
||||
"""
|
||||
Read the next lexical token from the stream and return a
|
||||
tuple (value, text), where |value| is the value associated with
|
||||
the token as specified by the Lexicon, and |text| is the actual
|
||||
string read from the stream. Returns (None, '') on end of file.
|
||||
"""
|
||||
queue = self.queue
|
||||
while not queue:
|
||||
self.text, action = self.scan_a_token()
|
||||
if action is None:
|
||||
self.produce(None)
|
||||
self.eof()
|
||||
else:
|
||||
value = action.perform(self, self.text)
|
||||
if value is not None:
|
||||
self.produce(value)
|
||||
result, self.last_token_position_tuple = queue[0]
|
||||
del queue[0]
|
||||
return result
|
||||
|
||||
def unread(self, token, value, position):
|
||||
self.queue.insert(0, ((token, value), position))
|
||||
|
||||
def get_current_scan_pos(self):
|
||||
# distinct from the position of the last token due to the queue
|
||||
return self.current_scanner_position_tuple
|
||||
|
||||
def scan_a_token(self):
|
||||
"""
|
||||
Read the next input sequence recognised by the machine
|
||||
and return (text, action). Returns ('', None) on end of
|
||||
file.
|
||||
"""
|
||||
self.start_pos = self.cur_pos
|
||||
self.current_scanner_position_tuple = (
|
||||
self.name, self.cur_line, self.cur_pos - self.cur_line_start
|
||||
)
|
||||
action = self.run_machine_inlined()
|
||||
if action is not None:
|
||||
if self.trace:
|
||||
print("Scanner: read: Performing %s %d:%d" % (
|
||||
action, self.start_pos, self.cur_pos))
|
||||
text = self.buffer[
|
||||
self.start_pos - self.buf_start_pos:
|
||||
self.cur_pos - self.buf_start_pos]
|
||||
return (text, action)
|
||||
else:
|
||||
if self.cur_pos == self.start_pos:
|
||||
if self.cur_char is EOL:
|
||||
self.next_char()
|
||||
if self.cur_char is None or self.cur_char is EOF:
|
||||
return (u'', None)
|
||||
raise Errors.UnrecognizedInput(self, self.state_name)
|
||||
|
||||
def run_machine_inlined(self):
|
||||
"""
|
||||
Inlined version of run_machine for speed.
|
||||
"""
|
||||
state = self.initial_state
|
||||
cur_pos = self.cur_pos
|
||||
cur_line = self.cur_line
|
||||
cur_line_start = self.cur_line_start
|
||||
cur_char = self.cur_char
|
||||
input_state = self.input_state
|
||||
next_pos = self.next_pos
|
||||
buffer = self.buffer
|
||||
buf_start_pos = self.buf_start_pos
|
||||
buf_len = len(buffer)
|
||||
b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
|
||||
None, 0, 0, 0, u'', 0, 0
|
||||
|
||||
trace = self.trace
|
||||
while 1:
|
||||
if trace:
|
||||
print("State %d, %d/%d:%s -->" % (
|
||||
state['number'], input_state, cur_pos, repr(cur_char)))
|
||||
|
||||
# Begin inlined self.save_for_backup()
|
||||
action = state['action']
|
||||
if action is not None:
|
||||
b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
|
||||
action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
|
||||
# End inlined self.save_for_backup()
|
||||
|
||||
c = cur_char
|
||||
new_state = state.get(c, NOT_FOUND)
|
||||
if new_state is NOT_FOUND:
|
||||
new_state = c and state.get('else')
|
||||
|
||||
if new_state:
|
||||
if trace:
|
||||
print("State %d" % new_state['number'])
|
||||
state = new_state
|
||||
# Begin inlined: self.next_char()
|
||||
if input_state == 1:
|
||||
cur_pos = next_pos
|
||||
# Begin inlined: c = self.read_char()
|
||||
buf_index = next_pos - buf_start_pos
|
||||
if buf_index < buf_len:
|
||||
c = buffer[buf_index]
|
||||
next_pos += 1
|
||||
else:
|
||||
discard = self.start_pos - buf_start_pos
|
||||
data = self.stream.read(0x1000)
|
||||
buffer = self.buffer[discard:] + data
|
||||
self.buffer = buffer
|
||||
buf_start_pos += discard
|
||||
self.buf_start_pos = buf_start_pos
|
||||
buf_len = len(buffer)
|
||||
buf_index -= discard
|
||||
if data:
|
||||
c = buffer[buf_index]
|
||||
next_pos += 1
|
||||
else:
|
||||
c = u''
|
||||
# End inlined: c = self.read_char()
|
||||
if c == u'\n':
|
||||
cur_char = EOL
|
||||
input_state = 2
|
||||
elif not c:
|
||||
cur_char = EOL
|
||||
input_state = 4
|
||||
else:
|
||||
cur_char = c
|
||||
elif input_state == 2: # after EoL (1) -> BoL (3)
|
||||
cur_char = u'\n'
|
||||
input_state = 3
|
||||
elif input_state == 3: # start new code line
|
||||
cur_line += 1
|
||||
cur_line_start = cur_pos = next_pos
|
||||
cur_char = BOL
|
||||
input_state = 1
|
||||
elif input_state == 4: # after final line (1) -> EoF (5)
|
||||
cur_char = EOF
|
||||
input_state = 5
|
||||
else: # input_state == 5 (EoF)
|
||||
cur_char = u''
|
||||
# End inlined self.next_char()
|
||||
else: # not new_state
|
||||
if trace:
|
||||
print("blocked")
|
||||
# Begin inlined: action = self.back_up()
|
||||
if b_action is not None:
|
||||
(action, cur_pos, cur_line, cur_line_start,
|
||||
cur_char, input_state, next_pos) = \
|
||||
(b_action, b_cur_pos, b_cur_line, b_cur_line_start,
|
||||
b_cur_char, b_input_state, b_next_pos)
|
||||
else:
|
||||
action = None
|
||||
break # while 1
|
||||
# End inlined: action = self.back_up()
|
||||
|
||||
self.cur_pos = cur_pos
|
||||
self.cur_line = cur_line
|
||||
self.cur_line_start = cur_line_start
|
||||
self.cur_char = cur_char
|
||||
self.input_state = input_state
|
||||
self.next_pos = next_pos
|
||||
if trace:
|
||||
if action is not None:
|
||||
print("Doing %s" % action)
|
||||
return action
|
||||
|
||||
def next_char(self):
|
||||
input_state = self.input_state
|
||||
if self.trace:
|
||||
print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos))
|
||||
if input_state == 1:
|
||||
self.cur_pos = self.next_pos
|
||||
c = self.read_char()
|
||||
if c == u'\n':
|
||||
self.cur_char = EOL
|
||||
self.input_state = 2
|
||||
elif not c:
|
||||
self.cur_char = EOL
|
||||
self.input_state = 4
|
||||
else:
|
||||
self.cur_char = c
|
||||
elif input_state == 2:
|
||||
self.cur_char = u'\n'
|
||||
self.input_state = 3
|
||||
elif input_state == 3:
|
||||
self.cur_line += 1
|
||||
self.cur_line_start = self.cur_pos = self.next_pos
|
||||
self.cur_char = BOL
|
||||
self.input_state = 1
|
||||
elif input_state == 4:
|
||||
self.cur_char = EOF
|
||||
self.input_state = 5
|
||||
else: # input_state = 5
|
||||
self.cur_char = u''
|
||||
if self.trace:
|
||||
print("--> [%d] %d %r" % (input_state, self.cur_pos, self.cur_char))
|
||||
|
||||
def position(self):
|
||||
"""
|
||||
Return a tuple (name, line, col) representing the location of
|
||||
the last token read using the read() method. |name| is the
|
||||
name that was provided to the Scanner constructor; |line|
|
||||
is the line number in the stream (1-based); |col| is the
|
||||
position within the line of the first character of the token
|
||||
(0-based).
|
||||
"""
|
||||
return self.last_token_position_tuple
|
||||
|
||||
def get_position(self):
|
||||
"""
|
||||
Python accessible wrapper around position(), only for error reporting.
|
||||
"""
|
||||
return self.position()
|
||||
|
||||
def begin(self, state_name):
|
||||
"""Set the current state of the scanner to the named state."""
|
||||
self.initial_state = (
|
||||
self.lexicon.get_initial_state(state_name))
|
||||
self.state_name = state_name
|
||||
|
||||
def produce(self, value, text=None):
|
||||
"""
|
||||
Called from an action procedure, causes |value| to be returned
|
||||
as the token value from read(). If |text| is supplied, it is
|
||||
returned in place of the scanned text.
|
||||
|
||||
produce() can be called more than once during a single call to an action
|
||||
procedure, in which case the tokens are queued up and returned one
|
||||
at a time by subsequent calls to read(), until the queue is empty,
|
||||
whereupon scanning resumes.
|
||||
"""
|
||||
if text is None:
|
||||
text = self.text
|
||||
self.queue.append(((value, text), self.current_scanner_position_tuple))
|
||||
|
||||
def eof(self):
|
||||
"""
|
||||
Override this method if you want something to be done at
|
||||
end of file.
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def start_line(self):
|
||||
return self.last_token_position_tuple[1]
|
||||
Reference in New Issue
Block a user