Add scripts/count_token_references.py to check for "unicorn" tokens. #1819

Merged
1 commit merged on Jun 20, 2021
2 changes: 2 additions & 0 deletions Makefile
@@ -24,6 +24,8 @@ check:
@pyflakes pygments | grep -v 'but unused' || true
@$(PYTHON) scripts/check_sources.py -i build -i dist -i pygments/lexers/_mapping.py \
-i docs/build -i pygments/formatters/_mapping.py -i pygments/unistring.py
@$(PYTHON) scripts/count_token_references.py --minfiles=1 --maxfiles=1 \
--minlines=1 --maxlines=3 --subtoken

clean: clean-pyc
-rm -rf doc/_build build Pygments.egg-info
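For reference, the check that make now runs can also be invoked by hand from the repository root; assuming $(PYTHON) resolves to a plain python, the two Makefile lines above amount to:

python scripts/count_token_references.py --minfiles=1 --maxfiles=1 --minlines=1 --maxlines=3 --subtoken

That is: report every token that is referenced in exactly one lexer source file and on at most three lines, with subtoken references credited to their parent tokens.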
267 changes: 267 additions & 0 deletions scripts/count_token_references.py
@@ -0,0 +1,267 @@
#!/usr/bin/env python
"""
Count number of references to tokens in lexer source
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:program:`count_token_references` counts how many references to all existing
tokens it can find by "grepping" the source code of the lexers. This can be
used to find typos in token names, since a misspelled token will typically be
referenced by only one lexer.

:program:`count_token_references` supports the following options:

.. program:: count_token_references

.. option:: -v, --verbose
This gives output while the script is collecting information.

.. option:: --minfiles <COUNT>
Only report about tokens that are referenced in at least this many lexer
source files (default 1).

.. option:: --maxfiles <COUNT>
Only report about tokens that are referenced in at most this many lexer
source files (default 1).

.. option:: --minlines <COUNT>
Only report about tokens that are referenced in at least this many lexer
source lines (default 1).

.. option:: --maxlines <COUNT>
Only report about tokens that are referenced in at most this many lexer
source lines (default 10).

.. option:: -s, --subtoken
    When ``--subtoken`` is given, each reference is also counted for each of
    the token's parent tokens. For example, if there are 10 occurrences of
    the token ``Token.Literal.Number.Integer`` and 10 occurrences of the
    token ``Token.Literal.Number.Hex`` but none of ``Token.Literal.Number``
    itself, ``Token.Literal.Number`` would be counted as having 20 references
    with ``--subtoken``.
"""

import sys, argparse, re, pathlib

from pygments import token, lexers


def lookup_all_lexers():
"""
Iterate through all lexers and fetch them.
This should create all tokens that any of the lexers produce.
"""
count = 0
for (name, aliases, patterns, mimetypes) in lexers.get_all_lexers():
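        # Instantiating one lexer per entry imports its module; the for/else
        # chains fall back from the first alias to the first filename pattern
        # and then to the first mimetype when the previous list is empty.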
for a in aliases:
l = lexers.get_lexer_by_name(a)
break
else:
for p in patterns:
l = lexers.get_lexer_for_filename(p)
break
else:
for m in mimetypes:
l = lexers.get_lexer_for_mimetype(m)
break
count += 1
return count


def fetch_lexer_sources():
"""
Return the source code of all lexers as a dictionary, mapping filenames
to a list of lines.
"""
lexer_dir = (pathlib.Path(__file__).parent / "../pygments/lexers").resolve()
    lexer_sources = {
        fn: fn.read_text(encoding="utf-8").splitlines(keepends=False)
        for fn in lexer_dir.glob("*.py")
    }
return lexer_sources


def sub_tokens(token):
"""
Generator that yields a token and all of its sub-tokens recursively.
"""
yield token
for subtoken in token.subtypes:
yield from sub_tokens(subtoken)


class FileCount:
"""
Stores information about line numbers in a file.

    This is used to store from which lines in a file a certain token is
    referenced.
"""
def __init__(self, filename):
self.filename = filename
self.lines = []

def __str__(self):
        if len(self.lines) > 3:
            lines = ", ".join(f"{line:,}" for line in self.lines[:5])
            lines = f"{lines}, ... ({len(self.lines):,} lines)"
else:
lines = ", ".join(f"{line:,}" for line in self.lines)
return f"{self.filename.name}[{lines}]"

def add(self, linenumber):
self.lines.append(linenumber)

def count_lines(self):
return len(self.lines)


class TokenCount:
"""
Stores information about a token and in which files it is referenced.
"""
def __init__(self, token):
self.token = token
self.files = {}

def add(self, filename, linenumber):
if filename not in self.files:
self.files[filename] = FileCount(filename)
self.files[filename].add(linenumber)

def __str__(self):
if len(self.files) > 3:
files = []
for (i, filecount) in enumerate(self.files.values()):
files.append(str(filecount))
if i >= 5:
break
files = ", ".join(files) + f", ... ({len(self.files):,} files)"
else:
files = ", ".join(str(filecount) for filecount in self.files.values())
return f"{self.count_files():,} files, {self.count_lines():,} locations: {files}"

def count_files(self):
return len(self.files)

def count_lines(self):
return sum(fc.count_lines() for fc in self.files.values())


def find_token_references(lexer_sources, args):
"""
Find all references to all tokens in the source code of all lexers.

    Note that this can't be 100% reliable, as it only searches the source code
    for certain patterns: it looks for the last two components of a token name,
    i.e. to find references to the token ``Token.Literal.Number.Integer.Long``
    it searches for the regular expression ``\\bInteger.Long\\b``. This won't
    work reliably for top-level tokens like ``Token.String``, since they are
    often referred to simply as ``String``, and searching for ``\\bString\\b``
    yields too many false positives.
"""

    # Maps each token to a :class:`TokenCount` object.
token_references = {}

# Search for each token in each lexer source file and record in which file
# and in which line they are referenced
for t in sub_tokens(token.Token):
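        # Token types are tuples of their name components (Token itself is
        # the empty tuple), so this takes the last two components, e.g.
        # Token.Literal.Number.Integer -> ['Number', 'Integer'].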
parts = list(t)[-2:]
if len(parts) == 0:
name = "Token"
elif len(parts) == 1:
name = f"Token.{parts[0]}"
else:
name = ".".join(parts)

token_references[t] = tokencount = TokenCount(t)

if name != "Token":
pattern = re.compile(f"\\b{name}\\b")

for (filename, sourcelines) in lexer_sources.items():
for (i, line) in enumerate(sourcelines, 1):
if pattern.search(line) is not None:
tokencount.add(filename, i)
if args.subtoken:
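                            # Credit the same reference to every ancestor of
                            # this token, all the way up to the root Token.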
t2 = t
while t2 is not token.Token:
t2 = t2.parent
tokencount2 = token_references[t2]
tokencount2.add(filename, i)

return token_references


def print_result(token_references, args):
    def key(item):
        # Sort key: (number of files, number of lines), ascending, so the
        # rarest tokens, i.e. the most likely typos, come first.
        return (item[1].count_files(), item[1].count_lines())

for (token, locations) in sorted(token_references.items(), key=key):
if args.minfiles <= locations.count_files() <= args.maxfiles and \
args.minlines <= locations.count_lines() <= args.maxlines:
print(f"{token}: {locations}")


def main(args=None):
p = argparse.ArgumentParser(description="Count how often each token is used by the lexers")
p.add_argument(
"-v", "--verbose",
dest="verbose", help="Give more output.",
default=False, action="store_true"
)
p.add_argument(
"--minfiles",
dest="minfiles", metavar="COUNT", type=int,
help="Report all tokens referenced by at least COUNT lexer source files (default %(default)s)",
default=1
)
p.add_argument(
"--maxfiles",
dest="maxfiles", metavar="COUNT", type=int,
help="Report all tokens referenced by at most COUNT lexer source files (default %(default)s)",
default=1
)
p.add_argument(
"--minlines",
dest="minlines", metavar="COUNT", type=int,
help="Report all tokens referenced by at least COUNT lexer source lines (default %(default)s)",
default=1
)
p.add_argument(
"--maxlines",
dest="maxlines", metavar="COUNT", type=int,
help="Report all tokens referenced by at most COUNT lexer source lines (default %(default)s)",
default=10
)
p.add_argument(
"-s", "--subtoken",
dest="subtoken",
help="Include count of references to subtokens in the count for each token (default %(default)s)",
default=False, action="store_true"
)

args = p.parse_args(args)

if args.verbose:
print("Looking up all lexers ... ", end="", flush=True)
count = lookup_all_lexers()
if args.verbose:
print(f"found {count:,} lexers")

if args.verbose:
print("Fetching lexer source code ... ", end="", flush=True)
lexer_sources = fetch_lexer_sources()
if args.verbose:
print(f"found {len(lexer_sources):,} lexer source files")

if args.verbose:
print("Finding token references ... ", end="", flush=True)
token_references = find_token_references(lexer_sources, args)
if args.verbose:
print(f"found references to {len(token_references):,} tokens")

if args.verbose:
print()
print("Result:")
print_result(token_references, args)


if __name__ == "__main__":
sys.exit(main())
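For anyone reviewing the matching logic, the name the script greps for a given token and the parent chain that --subtoken credits can be inspected interactively. A minimal sketch, not part of this diff, assuming only that pygments is importable:

from pygments.token import Token

t = Token.Literal.Number.Integer
print(list(t))                  # ['Literal', 'Number', 'Integer'] -- token types are tuples of name parts
print(".".join(list(t)[-2:]))   # 'Number.Integer' -- the name find_token_references() searches for
print(t.parent)                 # Token.Literal.Number -- an ancestor that --subtoken also credits
print(t.parent.parent.parent is Token)  # True -- the hierarchy ends at the root Token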