
Commit 885248c

Add sources for new parser and tokenizer
1 parent cd5767e commit 885248c

2 files changed: +308 -0 lines changed

packaging/_parser.py

+155
@@ -0,0 +1,155 @@
from typing import Any, List, Tuple

from ._tokenizer import Tokenizer


def parse_named_requirement(requirement: str) -> Tuple[str, str, List[str], str, str]:
    """
    NAMED_REQUIREMENT: NAME EXTRAS* URL_SPEC (SEMICOLON + MARKER)*
    """
    tokens = Tokenizer(requirement)
    name = tokens.read("IDENTIFIER").text
    extras = parse_extras(tokens)
    specifier = ""
    url = ""
    if tokens.match("URL_SPEC"):
        url = tokens.read().text[1:].strip()
    elif not tokens.match("stringEnd"):
        specifier = parse_specifier(tokens)
    if tokens.match("SEMICOLON"):
        marker = ""
        while not tokens.match("stringEnd"):
            # we don't validate markers here, it's done later as part of
            # packaging/requirements.py
            marker += tokens.read().text
    else:
        marker = ""
    tokens.expect("stringEnd")
    return (name, url, extras, specifier, marker)


def parse_extras(tokens: Tokenizer) -> List[str]:
    """
    EXTRAS: (LBRACKET + IDENTIFIER + (COLON + IDENTIFIER)* + RBRACKET)*
    """
    extras = []
    if tokens.try_read("LBRACKET"):
        while tokens.match("IDENTIFIER"):
            extras.append(tokens.read("IDENTIFIER").text)
            tokens.try_read("COLON")
        if not tokens.try_read("RBRACKET"):
            tokens.raise_syntax_error("Closing square bracket is missing")
    return extras


def parse_specifier(tokens: Tokenizer) -> str:
    """
    SPECIFIER: LPAREN (OP + VERSION + COLON)+ RPAREN | OP + VERSION
    """
    parsed_specifiers = ""
    lparen = False
    if tokens.try_read("LPAREN"):
        lparen = True
    while tokens.match("OP"):
        parsed_specifiers += tokens.read("OP").text
        if tokens.match("VERSION"):
            parsed_specifiers += tokens.read("VERSION").text
        else:
            tokens.raise_syntax_error("Missing version")
        if tokens.match("COLON"):
            parsed_specifiers += tokens.read("COLON").text
    if lparen and not tokens.try_read("RPAREN"):
        tokens.raise_syntax_error("Closing right parenthesis is missing")
    return parsed_specifiers


def parse_quoted_marker(tokens: Tokenizer) -> List[str]:
    tokens.try_read("SEMICOLON")
    return parse_marker_expr(tokens)


def parse_marker_expr(tokens: Tokenizer) -> List[str]:
    """
    MARKER_EXPR: MARKER_ATOM (BOOLOP + MARKER_ATOM)+
    """
    expression = [parse_marker_atom(tokens)]
    while tokens.match("BOOLOP"):
        tok = tokens.try_read("BOOLOP")
        expr_right = parse_marker_atom(tokens)
        expression.extend((tok.text, expr_right))
    return expression


def parse_marker_atom(tokens: Tokenizer) -> Any:
    """
    MARKER_ATOM: LPAREN MARKER_EXPR RPAREN | MARKER_ITEM
    """
    if tokens.try_read("LPAREN"):
        marker = parse_marker_expr(tokens)
        if not tokens.try_read("RPAREN"):
            tokens.raise_syntax_error("Closing right parenthesis is missing")
        return marker
    else:
        return parse_marker_item(tokens)


def parse_marker_item(tokens: Tokenizer) -> Tuple[Any, Any, Any]:
    """
    MARKER_ITEM: MARKER_VAR MARKER_OP MARKER_VAR
    """
    marker_var_left = parse_marker_var(tokens)
    marker_op = parse_marker_op(tokens)
    marker_var_right = parse_marker_var(tokens)
    return (marker_var_left, marker_op, marker_var_right)


def parse_marker_var(tokens: Tokenizer) -> Any:
    """
    MARKER_VAR: VARIABLE MARKER_VALUE
    """
    if tokens.match("VARIABLE"):
        return parse_variable(tokens)
    else:
        return parse_python_str(tokens)


def parse_variable(tokens: Tokenizer) -> Any:
    from .markers import Variable

    env_var = tokens.read("VARIABLE").text.replace(".", "_")
    if (
        env_var == "platform_python_implementation"
        or env_var == "python_implementation"
    ):
        return Variable("platform_python_implementation")
    else:
        return Variable(env_var)


def parse_python_str(tokens: Tokenizer) -> Any:
    from .markers import Value

    if tokens.match("QUOTED_STRING"):
        python_str = tokens.read().text.strip("\'\"")
        return Value(str(python_str))
    else:
        return tokens.raise_syntax_error(
            "String with single or double quote at the beginning is expected"
        )


def parse_marker_op(tokens: Tokenizer) -> Any:
    from .markers import Op

    if tokens.try_read("IN"):
        return Op("in")
    elif tokens.try_read("NOT"):
        tokens.read("IN")
        return Op("not in")
    elif tokens.match("OP"):
        return Op(tokens.read().text)
    else:
        return tokens.raise_syntax_error(
            'Couldn\'t parse marker operator. Expecting one of \
            "<=, <, !=, ==, >=, >, ~=, ===, not, not in"'
        )
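
Since the commit ships no usage notes, here is a minimal sketch of how the new entry point is meant to be driven, assuming the module is importable as packaging._parser; the expected values are inferred from the grammar above, not verified output from this commit:

from packaging._parser import parse_named_requirement

name, url, extras, specifier, marker = parse_named_requirement(
    'requests[security,tests]>=2.8.1,==2.8.*; python_version < "2.7"'
)
# name      -> "requests"
# url       -> ""  (no "@ <url>" part in this requirement)
# extras    -> ["security", "tests"]
# specifier -> ">=2.8.1,==2.8.*"  (OP, VERSION and COLON token texts, concatenated)
# marker    -> the raw token text after the specifier; it is deliberately
#              not validated here, that happens in packaging/requirements.py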

packaging/_tokenizer.py

+153
@@ -0,0 +1,153 @@
import re
from typing import Any, Dict, Generator, Optional, Sized

from .specifiers import Specifier


class Token:
    def __init__(self, name: str, text: str, position: int):
        self.name = name
        self.text = text
        self.position = position

    def matches(self, name: str = "", text: str = "") -> bool:
        if name and self.name != name:
            return False
        return True


class ParseException(Exception):
    """Parsing failed"""

    def __init__(self, message: str, position: int) -> None:
        super().__init__(message)
        self.position = position


DEFAULT_RULES = {
    None: r"[ \t]+",  # whitespace: not returned as tokens
    "LPAREN": r"\(",
    "RPAREN": r"\)",
    "LBRACKET": r"\[",
    "RBRACKET": r"\]",
    "SEMICOLON": r";",
    "COLON": r",",
    "QUOTED_STRING": re.compile(
        r"""
            ('[^']*')
            |
            ("[^"]*")
        """,
        re.VERBOSE,
    ),
    "OP": r"===|==|~=|!=|<=|>=|<|>",
    "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
    "BOOLOP": r"or|and",
    "IN": r"in",
    "NOT": r"not",
    "VARIABLE": re.compile(
        r"""
            python_version
            |python_full_version
            |os[._]name
            |sys[._]platform
            |platform_(release|system)
            |platform[._](version|machine|python_implementation)
            |python_implementation
            |implementation_(name|version)
            |extra
        """,
        re.VERBOSE,
    ),
    "URL_SPEC": "@ *[^ ]+",
    "IDENTIFIER": r"([a-zA-Z0-9]|-|_|\.)+",
}


class Tokenizer:
    """Stream of tokens for a LL(1) parser.

    Provides methods to examine the next token to be read, and to read it
    (advance to the next token).
    """

    next_token: Any

    def __init__(
        self, source: Sized, rules: Dict[Optional[str], object] = DEFAULT_RULES
    ) -> None:
        self.source = source
        self.rules = {name: re.compile(pattern) for name, pattern in rules.items()}
        self.next_token = None
        self.generator = self._tokenize()
        self.position = 0

    def peek(self, *match_args: None, **match_kwargs: None) -> Any:
        """Return the next token to be read"""
        if not self.next_token:
            self.next_token = next(self.generator)
        return self.next_token

    def match(self, *match_args: str, **match_kwargs: None) -> Any:
        """Return True if the next token matches the given arguments"""
        token = self.peek()
        return token.matches(*match_args, **match_kwargs)

    def expect(self, *match_args: str, **match_kwargs: None) -> Any:
        """Raise SyntaxError if the next token doesn't match given arguments"""
        token = self.peek()
        if not token.matches(*match_args, **match_kwargs):
            exp = " ".join(
                v
                for v in match_args
                + tuple(f"{k}={v!r}" for k, v in match_kwargs.items())
                if v
            )
            raise self.raise_syntax_error(f"Expected {exp}")
        return token

    def read(self, *match_args: str, **match_kwargs: None) -> Any:
        """Return the next token and advance to the next token

        Raise SyntaxError if the token doesn't match.
        """
        result = self.expect(*match_args, **match_kwargs)
        self.next_token = None
        return result

    def try_read(self, *match_args: str, **match_kwargs: None) -> Any:
        """read() if the next token matches the given arguments

        Do nothing if it does not match.
        """
        if self.match(*match_args, **match_kwargs):
            return self.read()

    def raise_syntax_error(self, message: str = "Invalid marker") -> Any:
        """Raise SyntaxError at the given position in the marker"""
        at = f"at position {self.position}:"
        marker = " " * self.position + "^"
        raise ParseException(
            f"{message}\n{at}\n {self.source}\n {marker}",
            self.position,
        )

    def _make_token(self, name: str, text: str) -> Token:
        """Make a token with the current position"""
        return Token(name, text, self.position)

    def _tokenize(self) -> Generator[Token, Token, None]:
        """The main generator of tokens"""
        while self.position < len(self.source):
            for name, expression in self.rules.items():
                match = expression.match(self.source, self.position)
                if match:
                    token_text = match[0]

                    if name:
                        yield self._make_token(name, token_text)
                    self.position += len(token_text)
                    break
            else:
                raise self.raise_syntax_error()
        yield self._make_token("stringEnd", "")
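
And a sketch of driving the tokenizer directly, under the same caveat that packaging._tokenizer is importable and the values are inferred from the rules and _tokenize() above: rules are tried in dictionary order at the current position, the unnamed whitespace rule is consumed without producing a token, and input no rule matches raises ParseException with a caret marking the offending position:

from packaging._tokenizer import ParseException, Tokenizer

tokens = Tokenizer("pip >= 20.0")
print(tokens.read("IDENTIFIER").text)  # pip
print(tokens.read("OP").text)          # >=
print(tokens.read("VERSION").text)     # 20.0
tokens.expect("stringEnd")             # no tokens left

# No rule matches "???", so tokenization fails mid-stream:
try:
    bad = Tokenizer("pip ???")
    bad.read("IDENTIFIER")  # "pip" reads fine
    bad.read("OP")          # tokenizing "???" raises
except ParseException as exc:
    print(exc.position)     # index of the first unmatchable character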

0 commit comments