-
Notifications
You must be signed in to change notification settings - Fork 5
/
docstring_parsers.py
435 lines (359 loc) · 14.8 KB
/
docstring_parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
"""
Docstring parsers.
Translates from the [ReST docstring format (Sphinx)](
https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html#the-sphinx-docstring-format)
Translates from the [numpydoc docstring format](https://numpydoc.readthedocs.io/en/latest/format.html)
"""
from collections import namedtuple
from functools import partial
from operator import contains
from typing import Tuple, List, Dict
from docstring_parser import Style
from doctrans.emitter_utils import interpolate_defaults
from doctrans.pure_utils import location_within
# Marker strings used to recognise (and split on) each supported docstring style.
# Element order is significant: `RETURN_TOKENS` below and the numpydoc scanner/parser
# index into these tuples positionally.
TOKENS = namedtuple("Tokens", ("rest", "google", "numpydoc"))(
    (":param", ":cvar", ":ivar", ":var", ":type", ":return", ":rtype"),
    ("Args:", "Kwargs:", "Returns:", "Raises:"),
    ("Parameters\n----------", "Returns\n-------"),
)

# Per-style subset of `TOKENS` that introduces return-value information.
RETURN_TOKENS = namedtuple("Tokens", ("rest", "google", "numpydoc"))(
    TOKENS.rest[-2:],
    # "Returns:" is the second-to-last google token; the last one is "Raises:",
    # which must not be treated as a return marker.
    (TOKENS.google[-2],),
    (TOKENS.numpydoc[-1],),
)
def parse_docstring(docstring, emit_default_doc=False):
    """
    Parse the docstring into its components.

    The style is guessed by substring search: any ReST token selects ReST (as does
    `None`), otherwise any Google section header selects Google, otherwise numpydoc
    is assumed.

    :param docstring: the docstring
    :type docstring: ```Optional[str]```

    :param emit_default_doc: Whether help/docstring should include 'With default' text
    :type emit_default_doc: ```bool```

    :return: a dictionary of form
      {
          'name': ...,
          'type': ...,
          'doc': ...,
          'params': [{'name': ..., 'typ': ..., 'doc': ..., 'default': ..., 'required': ... }, ...],
          'returns': {'name': ..., 'typ': ..., 'doc': ..., 'default': ..., 'required': ... }
      }
    :rtype: ```dict```
    """
    assert isinstance(docstring, (type(None), str)), "{typ} != str".format(
        typ=type(docstring).__name__
    )

    def mentions_any(tokens):
        """
        :param tokens: Candidate marker strings
        :type tokens: ```Tuple[str]```

        :return: Whether at least one marker occurs somewhere in `docstring`
        :rtype: ```bool```
        """
        return any(token in docstring for token in tokens)

    if docstring is None or mentions_any(TOKENS.rest):
        style = Style.rest
    elif mentions_any(TOKENS.google):
        # NOTE: Google style is detected here, but the scanner in this module only
        # has branches for the ReST and numpydoc styles.
        style = Style.google
    else:
        style = Style.numpydoc

    # Skeleton of the intermediate representation; returned untouched for None / "".
    ir = {"name": None, "type": "static", "doc": "", "params": [], "returns": None}

    if docstring:
        scanned = _scan_phase(docstring, style=style)
        # The parse phase fills `ir` in place
        _parse_phase(ir, scanned, emit_default_doc, style=style)

    return ir
def _scan_phase(docstring, style=Style.rest):
    """
    Scanner phase. Lexical analysis; to some degree…

    Looks up the known tokens for `style` and delegates to the style-specific scanner.

    :param docstring: the docstring
    :type docstring: ```str```

    :param style: the style of docstring
    :type style: ```Style```

    :return: For ReST, a list with each element a tuple of (whether value is a token, value);
      for numpydoc, a dict keyed by "doc" and the numpydoc section headers
    :rtype: ```Union[Dict[str, str], List[Tuple[bool, str]]]```

    :raises NotImplementedError: when `style` has tokens defined but no scanner (e.g., Google)
    """
    # A style whose name is not a field of `TOKENS` fails here with AttributeError
    known_tokens = getattr(TOKENS, style.name)
    if style is Style.rest:
        return _scan_phase_rest(docstring, known_tokens=known_tokens)
    elif style is Style.numpydoc:
        return _scan_phase_numpydoc(docstring, known_tokens=known_tokens)
    else:
        # Use the member's name: `name` is not readable on the Enum class itself,
        # so `Style.name` would raise AttributeError instead of this error.
        raise NotImplementedError(style.name)
def _scan_phase_numpydoc(docstring, known_tokens):
    """
    numpydoc scanner phase. Lexical analysis; to some degree…

    Splits the docstring into its leading description, its parameters section and
    its returns section.

    :param docstring: the docstring
    :type docstring: ```str```

    :param known_tokens: Valid tokens like `"Parameters\n----------"`; the first element is
      treated as the parameters header and the second as the returns header
    :type known_tokens: ```Tuple[str]```

    :return: Dict with key "doc" (the description text) plus one key per known token:
      a list of param dicts for the parameters header, and a dict of shape
      {'name': ..., 'typ': ..., 'doc': ... } for the returns header (an empty list
      when the section is absent)
    :rtype: ```Dict[str, Union[str, dict, List[dict]]]```
    """

    def as_return(typ, _, doc):
        """
        Internal function to turn `str.partition` output into a return param

        :param typ: the type
        :type typ: ```str```

        :param _: Ignore this. It should be a newline character.
        :type _: ```str```

        :param doc: the doc
        :type doc: ```str```

        :return: dict of shape {'name': ..., 'typ': ..., 'doc': ... }
        :rtype: ```dict```
        """
        return {"name": "return_type", "typ": typ, "doc": doc.lstrip()}

    # Every section starts out empty; found sections overwrite their entry
    scanned = {token: [] for token in ("doc",) + known_tokens}

    params_header, returns_header = known_tokens[0], known_tokens[1]

    # Find the first section header. `location_within` is used here as returning
    # (start index, end index, matched token) with a start of -1 when nothing matched
    # (inferred from how its result is consumed; confirm against its definition).
    start, end, header = location_within(docstring, (params_header,))
    if start == -1:
        # No parameters section; maybe there is only a returns section
        start, end, header = location_within(docstring, (returns_header,))

    if start <= -1:
        # No section at all: the whole text is the description
        scanned["doc"] = docstring.strip()
        return scanned

    scanned["doc"] = docstring[:start].strip()
    body = docstring[end:].strip()

    if header != params_header:
        # Returns-only docstring: first line is the type, the rest is the doc
        scanned[returns_header] = as_return(*body.partition("\n"))
        return scanned

    ret_start, ret_end, ret_header = location_within(body, (returns_header,))
    if ret_start > -1:
        returns_text = body[ret_end:].lstrip()
        # What precedes the returns header is the parameters text
        body = body[:ret_start]
        scanned[ret_header] = as_return(*returns_text.partition("\n"))

    # Next, separate into (namespace, name, [typ, doc, default]), updating `scanned` accordingly
    _parse_params_from_numpydoc(body, header, scanned)

    return scanned
def _parse_params_from_numpydoc(docstring, namespace, scanned):
"""
Internal function used by `_scan_phase_numpydoc` to extract the params into the doctrans ir
:param docstring: The docstring in numpydoc format
:type docstring: ```str```
:param namespace: The namespace, i.e., the key to update on the `scanned` param.
:type namespace: ```Literal["Parameters\n----------"]```
:param scanned: A list of dicts in docstring IR format, but with an outermost key of numpydoc known tokens
:type scanned: ```Dict[str, List[dict]]```
"""
stack, cur, col_on_line = [], {}, False
for idx, ch in enumerate(docstring):
stack.append(ch)
if ch == "\n":
stack_str = "".join(stack).strip()
if stack_str:
if col_on_line is True:
col_on_line = False
# cur["rest"] += stack_str
cur["typ"] = stack_str
else:
if cur:
cur["doc"] = stack_str
else:
cur = {"doc": stack_str}
stack.clear()
elif ch == ":":
if "name" in cur:
scanned[namespace].append(cur.copy())
cur.clear()
stack_str = "".join(stack[:-1]).strip()
if stack_str:
cur = {"name": stack_str, "doc": ""}
col_on_line = True
stack.clear()
if cur:
scanned[namespace].append(cur)
def _scan_phase_rest(docstring, known_tokens):
"""
Scanner phase. Lexical analysis; to some degree…
:param docstring: the docstring
:type docstring: ```str```
:param known_tokens: Valid tokens like `:param`
:type known_tokens: ```Tuple[str]```
:return: List with each element a tuple of (whether value is a token, value)
:rtype: ```List[Tuple[bool, str]]```
"""
rev_known_tokens_t = tuple(map(tuple, map(reversed, known_tokens)))
scanned: List[Tuple[bool, str]] = []
stack: List[str] = []
for ch in docstring:
stack.append(ch)
stack_rev = stack[::-1]
for token in rev_known_tokens_t:
token_len = len(token)
if tuple(stack_rev[:token_len]) == token:
scanned.append((bool(len(scanned)), "".join(stack[:-token_len])))
stack = stack[len(scanned[-1][1]) :][:token_len]
continue
if stack:
final = "".join(stack)
scanned.append(
(
bool(scanned and scanned[-1][0])
or any(map(final.startswith, known_tokens)),
final,
)
)
return scanned
def _parse_phase(intermediate_repr, scanned, emit_default_doc, style=Style.rest):
    """
    Parser phase: looks up the return tokens for `style` and hands off to the
    style-specific parser, which fills `intermediate_repr`.

    :param intermediate_repr: a dictionary of form
      {
          'name': ...,
          'type': ...,
          'doc': ...,
          'params': [{'name': ..., 'typ': ..., 'doc': ..., 'default': ..., 'required': ... }, ...],
          'returns': {'name': ..., 'typ': ..., 'doc': ..., 'default': ..., 'required': ... }
      }
    :type intermediate_repr: ```dict```

    :param scanned: Output of the scan phase for the same `style`
    :type scanned: ```Union[Dict[str, str], List[Tuple[bool, str]]]```

    :param emit_default_doc: Whether help/docstring should include 'With default' text
    :type emit_default_doc: ```bool```

    :param style: the style of docstring
    :type style: ```Style```

    :return: Whatever the style-specific parser returns
    :rtype: ```None```
    """
    return_tokens = getattr(RETURN_TOKENS, style.name)

    if style is Style.rest:
        parser = _parse_phase_rest
    elif style is Style.numpydoc:
        parser = _parse_phase_numpydoc
    else:
        # Tokens exist for this style but no parser does
        raise NotImplementedError(style.name)

    return parser(intermediate_repr, scanned, emit_default_doc, return_tokens)
def _parse_phase_numpydoc(intermediate_repr, scanned, emit_default_doc, return_tokens):
    """
    numpydoc parser phase: copies the scanned sections into `intermediate_repr`,
    passing every param (and the return, when present) through `interpolate_defaults`.

    :param intermediate_repr: a dictionary of form
      {
          'name': ...,
          'type': ...,
          'doc': ...,
          'params': [{'name': ..., 'typ': ..., 'doc': ..., 'default': ..., 'required': ... }, ...],
          'returns': {'name': ..., 'typ': ..., 'doc': ..., 'default': ..., 'required': ... }
      }
    :type intermediate_repr: ```dict```

    :param scanned: Dict keyed by "doc" and the numpydoc section headers
    :type scanned: ```Dict[str, str]```

    :param emit_default_doc: Whether help/docstring should include 'With default' text
    :type emit_default_doc: ```bool```

    :param return_tokens: numpydoc return tokens; the first is the key of the returns section
    :type return_tokens: ```Tuple[str]```
    """
    known_tokens = getattr(TOKENS, Style.numpydoc.name)

    def with_defaults(entry):
        """
        :param entry: dict of shape {'name': ..., 'typ': ..., 'doc': ... }
        :type entry: ```dict```

        :return: Result of `interpolate_defaults` for this entry
        :rtype: ```dict```
        """
        return interpolate_defaults(entry, emit_default_doc=emit_default_doc)

    # Compute everything first so that `intermediate_repr` is updated in one step
    doc = scanned["doc"]
    params = [with_defaults(param) for param in scanned.get(known_tokens[0], [])]

    returns_key = return_tokens[0]
    # An absent (or empty) returns section is represented by None
    returns = with_defaults(scanned[returns_key]) if scanned.get(returns_key) else None

    intermediate_repr.update({"doc": doc, "params": params, "returns": returns})
def _parse_phase_rest(intermediate_repr, scanned, emit_default_doc, return_tokens):
    """
    ReST parser phase: folds the scanned segments into `intermediate_repr`, in place.

    Consecutive token segments that share a param name (e.g., `:param x:` and `:type x:`)
    are merged into a single param dict.

    :param intermediate_repr: a dictionary of form
      {
          'name': ...,
          'type': ...,
          'doc': ...,
          'params': [{'name': ..., 'typ': ..., 'doc': ..., 'default': ..., 'required': ... }, ...],
          'returns': {'name': ..., 'typ': ..., 'doc': ..., 'default': ..., 'required': ... }
      }
    :type intermediate_repr: ```dict```

    :param scanned: List with each element a tuple of (whether value is a token, value)
    :type scanned: ```List[Tuple[bool, str]]```

    :param emit_default_doc: Whether help/docstring should include 'With default' text
    :type emit_default_doc: ```bool```

    :param return_tokens: Tokens introducing return information; the last one marks the return type
    :type return_tokens: ```Tuple[str]```
    """
    current = {}
    for is_token, segment in scanned:
        if is_token is not True:
            # Plain text: only the first non-empty one becomes the description
            if not intermediate_repr["doc"]:
                intermediate_repr["doc"] = segment.strip()
            continue

        if any(segment.startswith(token) for token in return_tokens):
            # Skip the token's leading ":" and take what follows the closing one
            value = segment[segment.find(":", 1) + 1 :].strip()
            if intermediate_repr["returns"] is None:
                intermediate_repr["returns"] = {}
            key, parsed = _set_param_values(segment, value, return_tokens[-1])
            intermediate_repr["returns"].update(
                interpolate_defaults(
                    {key: parsed, "name": "return_type"},
                    emit_default_doc=emit_default_doc,
                )
            )
            continue

        # `<token> <name>: <value>`
        space_at = segment.find(" ")
        colon_at = segment.find(":", space_at)
        name = segment[space_at + 1 : colon_at]

        if "name" in current and current["name"] != name:
            # A different param starts; store the finished one unless it is starred
            if current["name"][0] != "*":
                intermediate_repr["params"].append(current)
            current = {}

        value = segment[colon_at + 1 :].strip()
        key, parsed = _set_param_values(segment, value)
        current.update({key: parsed, "name": name})
        current = interpolate_defaults(current, emit_default_doc=emit_default_doc)

    if current:
        # The last param has no successor to trigger its storage
        intermediate_repr["params"].append(current)
def _set_param_values(input_str, val, sw=":type"):
"""
Sets the typ or doc values properly.
:param val: The value (`sw` figures out what it means semantically)
:type val: ```str```
:param sw: Startswith condition
:type sw: ```str```
:return: Properly derived key and [potentially modified] value
:rtype: Tuple[Literal['doc', 'typ'], str]
"""
return (
("typ", (lambda v: "dict" if v.startswith("**") else v)(val.replace("```", "")))
if input_str.startswith(sw)
else ("doc", val)
)
# Star-imports expose only the top-level parser; the scan/parse phases are internal.
__all__ = ["parse_docstring"]