xxx

#!/usr/bin/env python3
"""
usage: xxx [-l | --list]  [--branch] [--with <word>] [<path> [<path> ...]]
       xxx [-c | --count] [--branch] [--with <word>] [<path> [<path> ...]]
       xxx [-e | --edit]  [--branch] [--with <word>] [<path> [<path> ...]]
       xxx --version
       xxx --help

List, count, and edit XXX/TODO/FIXME remarks in a collection of files.

In a git repository, this command will only search files tracked by git,
limited by the paths given on the command-line.  The -b / --branch option will
cause this command to only include remarks added by the current branch.  Set
xxx.branch in your git config (per repo or globally) to automatically enable
the --branch option.

Outside a git repository, this command walks the filesystem starting with the
given paths.  If no paths are given, it starts from the current directory.  If
a git repository (a directory with a .git directory) is encountered during
this walk, only the files tracked by git in the repository are searched.

Paths matching any glob-style patterns listed in an ~/.xxxignore file are
ignored.

Output of --list is automatically paged through XXX_PAGER, PAGER, or less.

Written by Thomas Sibley <https://tsibley.net>.
"""

import argparse
import json
import re
import os
import subprocess
import shlex
import sys
from contextlib import redirect_stdout
from functools import partial
from hashlib import md5
from io import TextIOWrapper
from itertools import chain, takewhile, filterfalse
from pathlib import Path
from shlex import split as shellwords
from subprocess import Popen
from tempfile import NamedTemporaryFile
from textwrap import dedent
from typing import Callable, Iterable, List, NamedTuple


# Version string reported by -v / --version.
VERSION = '1.1.2'

# Editor and pager command lines, split shell-style into argv lists.  EDIT()
# appends the quickfix file name to EDITOR and exec()s it; LIST() pipes its
# output into PAGER.  XXX_PAGER takes precedence over PAGER.
EDITOR = shellwords(os.environ.get("XXX_EDITOR", "vim -q"))
PAGER  = shellwords(os.environ.get("XXX_PAGER",
                    os.environ.get("PAGER", "less -SRFXi")))

# LIST() only emits ANSI color escapes when stdout is a terminal.
COLORIZED = sys.stdout.isatty()

# Matches any one of the remark markers.  The single capture group is relied
# upon by LIST() (as \1) to highlight the marker, and the pattern text is
# embedded into the larger expression built by match_remark_start().
TAG = re.compile("(" + "|".join(["XXX", "TODO", "FIXME"]) + ")")

# Comment leaders which grep() looks for in the text preceding a marker to
# recognize remarks trailing at the end of a line.
COMMENT_START_SEQUENCE = [
    "#",    # Many shell-like languages
    "//",   # Many C-like languages
    "--",   # SQL
]


class Prefix(NamedTuple):
    """
    Text found in front of a remark.

    *first* is what preceded the marker on the remark's first line and
    *continued* is what following lines must start with to be treated as a
    continuation of the same remark.
    """
    first: str
    continued: str


# Bump this if the structure of XXX below changes, to invalidate any caches.
XXX_VERSION = 1


class XXX(NamedTuple):
    """
    A single remark found in a file.
    """
    file: Path              # File the remark was found in
    line_number: int        # 1-based line number of the remark's first line
    remark: List[str]       # Lines of the remark, first line first
    prefix: Prefix          # Text preceding the remark on its lines
    spacing: str            # Whitespace between the prefix and the remark
    context: List[str]      # Lines following the remark, kept for context


# A predicate over paths, such as the one built by path_matcher().
PathMatcher = Callable[[Path], bool]


def MAIN(args):
    """
    Parse the command-line *args* (without the program name), find matching
    remarks, and hand them to the selected mode (LIST, LIST_JSON, COUNT, or
    EDIT).

    The usage, description, and epilog shown by --help are taken from the
    paragraphs of this module's docstring.
    """
    doc_paragraphs = __doc__.strip("\n").split("\n\n")

    parser = argparse.ArgumentParser(
        "xxx",
        usage           = ltrim("usage: ", doc_paragraphs[0]),
        description     = "\n\n".join(doc_paragraphs[1:-1]),
        epilog          = doc_paragraphs[-1],
        formatter_class = argparse.RawDescriptionHelpFormatter,
        add_help        = False)

    # Modes of operation
    modes = parser.add_mutually_exclusive_group()

    modes.add_argument(
        "-l", "--list",
        help    = "List remarks for the given paths",
        action  = "store_const",
        dest    = "mode",
        const   = LIST)

    modes.add_argument(
        "-j", "--json",
        help    = "List remarks for the given paths as JSON",
        action  = "store_const",
        dest    = "mode",
        const   = LIST_JSON)

    modes.add_argument(
        "-c", "--count",
        help    = "Count remarks for the given paths",
        action  = "store_const",
        dest    = "mode",
        const   = COUNT)

    modes.add_argument(
        "-e", "--edit",
        help    = "Edit remarks for the given paths using vim's quickfix list.  "
                  "Set XXX_EDITOR to use your preferred editor.",
        action  = "store_const",
        dest    = "mode",
        const   = EDIT)

    parser.set_defaults(mode = LIST)

    # Options
    parser.add_argument(
        "-w", "--with",
        help    = "Limit output of remarks to those with the given keyword.  "
                  "May be used more than once.",
        metavar = "<word>",
        action  = "append",
        dest    = "words")

    # const is what argparse stores when the option is given without a <ref>.
    # Without it a bare --branch would parse to None and be indistinguishable
    # from the option not being given at all.
    parser.add_argument(
        "-b", "--branch",
        help    = "Limit output of remarks to those added by the current git branch "
                  "since diverging from the given ref.  Defaults to master (or value "
                  "of xxx.branch config key) if no ref is given.",
        metavar = "<ref>",
        nargs   = "?",
        const   = True)

    parser.add_argument(
        "--no-branch",
        help    = "Do not limit output of remarks to those added by the current git branch "
                  "since diverging from the given ref.  "
                  "Disables a previously given -b or --branch option "
                  "and reading of xxx.branch from git config.",
        dest    = "branch",
        action  = "store_false")

    parser.add_argument(
        "-A", "--after",
        help    = "Include <n> lines after a remark for context.",
        metavar = "<n>",
        type    = int,
        default = 0)

    modes.add_argument(
        "-h", "--help",
        help    = "Show this help",
        action  = "help")

    modes.add_argument(
        "-v", "--version",
        help    = "Show the version of %(prog)s",
        action  = "version",
        version = "%(prog)s " + VERSION)

    parser.add_argument(
        "paths",
        help    = "Limit output of remarks to those in the given paths.  "
                  "Defaults to searching the current directory.",
        metavar = "<path>",
        nargs   = "*")

    opts = parser.parse_args(args)

    # At this point opts.branch is one of:
    #
    #   None    neither --branch nor --no-branch was given
    #   True    --branch was given without a <ref>
    #   False   --no-branch was given
    #   a str   --branch <ref> was given
    #
    if opts.branch is True:
        # Explicitly requested without a ref: use the ref named in git config,
        # if any, and otherwise master, as the help text promises.
        opts.branch = branch_flag_from_git_config() or "master"

    elif opts.branch is None:
        # Default --branch from git config, if not specified on command line.
        opts.branch = branch_flag_from_git_config()

    # Construct filter sets
    filter_words = wordset(opts.words or [])

    # XXX FIXME: This assumes we're in a git repo already.
    if opts.branch and git_rev_parse(opts.branch) != git_rev_parse("HEAD"):
        filter_lines = set(
            (filename, line)
                for filename, lines in git_lines_added(opts.branch)
                for line in lines)
    else:
        filter_lines = None

    # Find files
    paths = opts.paths or ["."]
    ignored = ignored_matcher()

    files = filterfalse(ignored, ls(paths))

    # Find XXXs
    xxxs = (
        xxx for xxx in grep(files, opts.after)

             if (not filter_words
                 or filter_words & wordset(xxx.remark))

            and (filter_lines is None
                 or (xxx.file, xxx.line_number) in filter_lines)
    )

    # Display what we found
    opts.mode(xxxs)


def LIST(xxxs):
    """
    Print the remarks in *xxxs* through the pager, with a heading line for
    each file the first time one of its remarks is printed.
    """
    # Easier to put these here than require the termcolor library from outside
    # the standard library.
    def escape(sequence):
        return sequence if COLORIZED else ""

    green     = escape("\033[1;32m")
    blue      = escape("\033[0;34m")
    cyan      = escape("\033[0;36m")
    on_yellow = escape("\033[0;103m")
    reset     = escape("\033[0m")

    # Ensure the pager's environment includes LESS=RK, either by setting it or
    # appending it to an existing value, so less interprets color escapes and
    # handles interrupts gracefully (if it is the pager).
    env = os.environ.copy()
    less_options = env.get("LESS", "")

    for flag in "RK":
        if flag not in less_options:
            less_options += flag

    env["LESS"] = less_options

    with Popen(PAGER, stdin = subprocess.PIPE, env = env) as pager, \
         TextIOWrapper(pager.stdin, encoding = "utf-8") as pager_stdin, \
         redirect_stdout(pager_stdin):

        files_seen = set()

        for item in xxxs:
            if item.file not in files_seen:
                files_seen.add(item.file)
                print(f"{green}{item.file}{reset}")

            if not item.context:
                lines = item.remark
            else:
                # When we have context lines, post-process the remark and
                # context together to dedent them as a whole block so that
                # variation within the combined lines is preserved.  This also
                # restores the prefix and spacing to each line of the remark
                # to preserve alignment.
                #
                # XXX TODO: This might be better implemented (maybe in grep()
                # even?) by ltrim()-ing some leading substring of the prefix
                # (e.g. the part of the prefix before the first
                # COMMENT_START_SEQUENCE), but that's fiddlier and this is
                # easier.  Let's see if it's good enough.
                head = item.remark[0]

                # Trailing remarks at lines' end already include the prefix.
                # See "Rescue single-line trailing XXX markers" in grep().
                if not head.startswith(item.prefix.first.lstrip()):
                    head = item.prefix.first + item.spacing + head

                tail = [
                    item.prefix.continued + item.spacing + rest
                        for rest in item.remark[1:] ]

                block = "\n".join([head, *tail, *item.context])
                lines = dedent(block).split("\n")

            # Continuation lines are aligned beneath the line number.
            gutter = " " * len(str(item.line_number))

            for position, line in enumerate(lines):
                if position:
                    print(f"{gutter}{cyan}:{reset} {line}")
                else:
                    highlighted = TAG.sub(on_yellow + r"\1" + reset, line)
                    print(f"{blue}{item.line_number}{cyan}:{reset} {highlighted}")

            print()


def LIST_JSON(xxxs):
    """
    Print each remark in *xxxs* as a JSON object, one per line.
    """
    for remark in xxxs:
        record = remark._asdict()
        print(as_json(record))


def COUNT(xxxs):
    """
    Print the number of remarks in *xxxs*.
    """
    total = sum(1 for _ in xxxs)
    print(total)


def EDIT(xxxs):
    """
    Write the remarks in *xxxs* to a temporary file as "file:line: remark"
    lines and replace this process with the editor, passing it that file.
    """
    errors = NamedTemporaryFile(
        mode = "w",
        encoding = "utf-8",
        prefix = "xxx-",
        suffix = ".err")

    errors.writelines(
        f"{xxx.file}:{xxx.line_number}: {xxx.remark[0]}\n"
            for xxx in xxxs)

    # The editor reads the file by name, so everything must be on disk before
    # this process is replaced.
    errors.flush()

    os.execvp(EDITOR[0], EDITOR + [errors.name])


def wordset(lines):
    """
    Return the set of distinct words in *lines*, an iterable of strings.

    Words are split on whitespace, lowercased, and stripped of any non-word
    characters.  Words which have nothing left after stripping (e.g. a lone
    "-" or "!!!") are left out, so the empty string is never a member.
    """
    normalized = (
        re.sub(r"\W", "", word.lower())
            for line in lines
            for word in line.split()
    )

    # Filter after normalizing: str.split() never produces empty words, but
    # stripping punctuation can.
    return {word for word in normalized if word}


def printf(string, mapping, **kwargs):
    """
    Fill the {}-style placeholders in *string* from *mapping* and print the
    result, passing any other keyword arguments on to print().
    """
    rendered = string.format_map(mapping)
    print(rendered, **kwargs)


def match_remark_start(line):
    """
    Look for the start of a remark in *line*.

    Returns a dict with the keys "prefix", "spacing", and "remark" when the
    line contains a marker, otherwise None.
    """
    pattern = (
        r"^(?P<prefix>.*?)"
        r"(?P<spacing>\s*)"
        r"\b(?P<remark>" + TAG.pattern + r"\b(?![\"']).*)$"
    )

    found = re.search(pattern, line)

    if found is None:
        return None

    return found.groupdict()


def grep(files, context_after = 0):
    """
    Yield an XXX for every remark found in *files*, an iterable of Paths.

    Up to *context_after* lines following each remark are kept as context.

    Results are looked up in, and stored to, a Cache, so a file is only
    scanned when no cache entry exists for it.
    """
    def _grep(file, context_after):
        """
        Scan a single *file* line by line and yield an XXX per remark found.
        """
        with file.open(encoding = "UTF-8") as fh:
            try:
                # The remark currently being collected, if any.
                xxx = None

                for line_number, line in enumerate(fh, 1):
                    # When we're inside a remark…
                    if xxx is not None:

                        # …either the current line continues the remark…
                        #
                        # NOTE(review): xxx.prefix is a two-field Prefix tuple,
                        # so the first operand looks like it is always truthy;
                        # confirm whether xxx.prefix.continued was intended.
                        if xxx.prefix and line.startswith(xxx.prefix.continued) and not match_remark_start(line):

                            # The prefix should always be removed, but the
                            # spacing may not be present for blank continuation
                            # lines which omit trailing whitespace.  Consider
                            # the example below where the end-of-line is marked
                            # by a ↵ character:
                            #
                            #    // XXX: foo bar↵
                            #    //↵
                            #    // more notes here↵
                            #
                            # The second line contains the prefix and should be
                            # part of the continued remark, but lacks the
                            # spacing.
                            #
                            # XXX TODO: Support for comment blocks, e.g. /* */
                            # and """ """, without middle leaders.
                            continued_remark = ltrim(xxx.spacing, ltrim(xxx.prefix.continued, line.rstrip("\n")))

                            # Skip the final non-content line in a common
                            # multi-line comment convention:
                            #
                            #       /* XXX TODO: foo bar baz
                            #        * bam bat.
                            #        * YMMV.
                            #        */
                            #
                            # by skipping if the line is (*/).  See also the
                            # handling of the start further below.
                            if xxx.prefix.first.endswith("/*") and line.rstrip().endswith("*/") and continued_remark == "/":
                                continue

                            xxx.remark.append(continued_remark)
                            continue

                        # …is following context for the remark…
                        elif context_after and len(xxx.context) < context_after and not match_remark_start(line):
                            xxx.context.append(line.rstrip("\n"))
                            continue

                        # …or ends the remark.
                        else:
                            yield xxx
                            xxx = None

                    # Not inside a remark, so let's look for a new one.
                    assert xxx is None

                    match = match_remark_start(line)

                    if match:
                        # See comment above, in the elif block, for why prefix
                        # and spacing are separate.
                        prefix  = match["prefix"]
                        spacing = match["spacing"]
                        remark  = match["remark"]

                        # Rescue single-line trailing XXX markers like:
                        #
                        #     foo = dubious_thing()     # FIXME
                        #     <-------- prefix --------->
                        #
                        #     foo = dubious_thing()  # revisit this choice TODO
                        #     <---------------- prefix ------------------>
                        #
                        #     # I'll get to this later... TODO
                        #     <-------- prefix --------->
                        #
                        # but not continuations like this:
                        #
                        #     # XXX TODO: foo bar baz
                        #     # bam bat
                        #     ^
                        #     prefix
                        #
                        # Trailing markers mean the whole line becomes the remark.
                        if any((c in prefix and prefix.strip() != c) for c in COMMENT_START_SEQUENCE):
                            remark = line.strip()

                        # Handle a common multi-line comment convention:
                        #
                        #       /* XXX TODO: foo bar baz
                        #        * bam bat.
                        #        * YMMV.
                        #        */
                        #
                        # by adjusting the prefix that must match for continued lines.
                        if prefix.endswith("/*"):
                            prefix_continued = rtrim("/*", prefix) + " *"
                        else:
                            prefix_continued = prefix

                        xxx = XXX(file, line_number, [remark], Prefix(prefix, prefix_continued), spacing, [])

                # The loop body never breaks, so this runs once the last line
                # has been read.
                else:
                    # Yield any remark which runs up to EOF
                    if xxx is not None:
                        yield xxx
                        xxx = None

            except UnicodeDecodeError:
                # Likely a binary file.  It may be text encoded in something
                # other than UTF-8, but we have no way of knowing what it is
                # for sure.
                #
                # Remarks already yielded before the undecodable bytes were
                # reached are kept; the rest of the file is skipped.
                pass

    cache = Cache(context_after)

    for file in files:
        try:
            yield from cache[file]
        except KeyError:
            # No cache entry: scan the file, remember the result, and only
            # then pass it on.
            xxxs = list(_grep(file, context_after))
            cache[file] = xxxs
            yield from xxxs


class Cache:
    """
    An on-disk cache of the remarks found in files, kept in ~/.cache/xxx.

    Entries are keyed by XXX_VERSION, the MD5 digest of a file's contents, and
    the number of context lines requested, so an entry is only ever found for
    unchanged content scanned with the same settings.  Each entry is a file
    of JSON objects, one remark per line.
    """
    dir = Path.home() / ".cache/xxx"

    def __init__(self, context_after):
        """
        Create the cache directory if necessary.  *context_after* is the
        number of context lines kept per remark and becomes part of every key.
        """
        self.context_after = context_after

        # Create missing parents too: ~/.cache itself doesn't exist on every
        # system, and without it mkdir() raises FileNotFoundError.
        self.dir.mkdir(parents = True, exist_ok = True)

    def __getitem__(self, path: Path):
        """
        Yield the cached XXXs for the file at *path*.

        This is a generator, so nothing happens until it is iterated.  KeyError
        is raised on the first iteration if there is no entry for the file's
        current contents.
        """
        key = self.key(path)

        try:
            with (self.dir / key).open("rb") as file:
                for line in file:
                    xxx = json.loads(line)
                    yield XXX(
                        Path(xxx["file"]),
                        xxx["line_number"],
                        xxx["remark"],
                        Prefix(*xxx["prefix"]),
                        xxx["spacing"],
                        xxx["context"])

        except FileNotFoundError:
            raise KeyError(path) from None

    def __setitem__(self, path: Path, xxxs: "List[XXX]"):
        """
        Store *xxxs* as the remarks found in the file at *path*, replacing any
        existing entry for the same contents.
        """
        key = self.key(path)

        with (self.dir / key).open("w", encoding = "utf-8") as file:
            for xxx in xxxs:
                print(as_json(xxx._asdict()), file = file)

    def key(self, path: Path):
        """
        Return the name of the cache entry for the file at *path*.
        """
        return f"{XXX_VERSION}-{self._digest(path)}-A{self.context_after}"

    def _digest(self, path: Path):
        """
        Return the hexadecimal MD5 digest of the contents of the file at *path*.
        """
        # Performant approach borrowed, with modifications, from
        # hashlib.file_digest in Python 3.11.
        if not hasattr(self, "buf"):
            self.buf = bytearray(2**18)       # Reusable buffer to reduce allocations.
            self.view = memoryview(self.buf)

        digest = md5()

        with path.open("rb") as fh:
            while True:
                size = fh.readinto(self.buf)
                if size == 0:
                    break  # EOF
                digest.update(self.view[:size])

        return digest.hexdigest()


class JSONEncoder(json.JSONEncoder):
    """
    A JSON encoder which serializes Path objects as strings and, unless told
    otherwise, refuses to emit NaN and the infinities.
    """
    def __init__(self, allow_nan = False, **kwargs):
        kwargs["allow_nan"] = allow_nan
        super().__init__(**kwargs)

    def default(self, value):
        if not isinstance(value, Path):
            # Let the base class raise the usual TypeError.
            return super().default(value)

        return str(value)


# Shared encoder used wherever remarks are serialized.
_encoder = JSONEncoder()
as_json = _encoder.encode


def ls(paths):
    """
    Yield the files to search under *paths*: the files git lists for them, or
    if the git command fails, the result of walking the filesystem.
    """
    try:
        tracked = git_ls(limit_to = paths)
        yield from tracked
    except subprocess.CalledProcessError:
        roots = paths or ["."]
        yield from walk(roots)


def git_ls(repo = ".", limit_to = ()):
    """
    Yield a Path for each text file tracked by git in *repo*, optionally
    limited to the paths in *limit_to* (given relative to *repo*).

    Yielded paths are prefixed with *repo*.  Tracked paths which are missing
    from the work dir, or are directories there, are left out.

    Raises FileNotFoundError if a path in *limit_to* doesn't exist and
    subprocess.CalledProcessError if the git command fails, e.g. because
    *repo* isn't inside a git repository.  As this is a generator, both are
    raised when iteration starts.
    """
    RepoPath = partial(Path, repo)

    # Check explicitly: git exits successfully with no output when given a
    # path which doesn't exist.
    for subpath in map(RepoPath, limit_to):
        if not subpath.exists():
            raise FileNotFoundError("The path '" + str(subpath) + "' does not exist in the git repository.")

    def text_files():
        # By default git quotes and octal-escapes paths containing non-ASCII
        # characters, which would then not name existing files.  Turning off
        # core.quotePath outputs them as-is.
        listing = capture_output([
            "git", "-C", repo, "-c", "core.quotePath=false",
            "ls-files", "--eol", "--", *limit_to])

        for line in listing:
            # --eol gives lines like
            #
            #    i/<eolinfo><SPACES>w/<eolinfo><SPACES>attr/<eolattr><SPACE*><TAB><file>
            #
            eol, path = line.split("\t", 1)

            # Ignore non-text files, either by Git's own intuition of the file
            # in the work dir or an explicitly set attribute.
            if "w/-text" in eol or "attr/-text" in eol:
                continue

            yield path

    yield from (
        subpath
            for subpath in map(RepoPath, text_files())
             if not subpath.is_dir()
            and subpath.exists())


def walk(paths):
    """
    Yield a Path for each file found by walking the directory trees rooted at
    *paths*, following symlinks.

    A directory containing a .git directory is not descended into; the files
    git lists for it are yielded instead.  Errors while walking are raised
    rather than ignored.
    """
    def reraise(error):
        raise error

    for root in paths:
        for base, dirs, files in os.walk(root, onerror = reraise, followlinks = True):
            if ".git" not in dirs:
                for name in files:
                    yield Path(base, name)
                continue

            # Prevent os.walk() from continuing down this tree.
            dirs.clear()

            # Instead, ask git to return indexed files.
            yield from git_ls(base)


def ignored_matcher():
    """
    Return a function which tells whether a Path matches any of the patterns
    listed in ~/.xxxignore.  Blank lines and lines starting with # in that
    file are skipped.
    """
    # XXX TODO: This should support walking up the chain of parents looking for
    # .xxxignore files more-specific to a directory.
    #   -trs, 23 May 2019

    ignore_file = Path.home() / ".xxxignore"

    if not ignore_file.is_file():
        return path_matcher(set())

    with ignore_file.open(encoding = "utf-8") as file:
        stripped = (line.strip() for line in file)
        patterns = {
            line for line in stripped
                  if line and not line.startswith("#") }

    return path_matcher(patterns)


def path_matcher(patterns: Iterable[str]) -> PathMatcher:
    """
    Build a predicate which is true for a Path matching at least one of the
    glob *patterns*.
    """
    def matches(path: Path) -> bool:
        for pattern in patterns:
            if path.match(pattern):
                return True

        return False

    return matches


def git_lines_added(since_ref = "master"):
    """
    Yield a (filename, line numbers) pair for every run of lines added in the
    work dir since HEAD diverged from *since_ref*.

    Only changes at or below the current directory are considered.  Each
    filename is a Path relative to the current directory and the line numbers
    are a range() of line numbers in the current version of the file.

    Raises subprocess.CalledProcessError if a git command fails.
    """
    cwd_prefix = capture_output(["git", "rev-parse", "--show-prefix"])[0]
    merge_base = capture_output(["git", "merge-base", since_ref, "HEAD"])[0]

    HUNK_HEADER = re.compile(r'^@@ -\d+(?:,\d+)? \+(?P<start>\d+)(?:,(?P<length>\d+))?', re.ASCII)

    # Pin down every aspect of the output format which is parsed below, as
    # each can be changed by the user's git config: the a/ and b/ prefixes
    # (diff.noprefix, diff.mnemonicPrefix), color escapes (color.diff,
    # color.ui), and the quoting of non-ASCII paths (core.quotePath).
    diff = capture_output([
        "git", "-c", "core.quotePath=false",
        "diff", "--no-ext-diff", "--no-color",
        "--src-prefix=a/", "--dst-prefix=b/",
        "-U0", merge_base, "--", "."])

    filename = None

    for line in diff:
        if line.startswith("+++ b/"):
            # git appends a tab to file names which contain spaces.
            name = ltrim("+++ b/", line).rstrip("\t")
            filename = Path(name).relative_to(cwd_prefix)

        elif filename and line.startswith("@@"):
            match = HUNK_HEADER.search(line)
            assert match, "Line starting with @@ didn't match hunk header pattern"

            # A missing length (None) means a length of 1, i.e. the ending line
            # is the same as the starting line.  A length of 0 means the hunk
            # is a pure deletion.
            start  = int(match.group("start"))
            length = int(match.group("length") or 1)

            if length:
                # insertion (+) filename, line interval [start, end)
                yield (filename, range(start, start + length))


def git_rev_parse(rev):
    """
    Return the first line of output from `git rev-parse` for *rev*.
    """
    output = capture_output(["git", "rev-parse", rev])
    return output[0]


def branch_flag_from_git_config():
    """
    Return the ref to use for --branch according to the xxx.branch key in git
    config, or None if branch filtering isn't enabled there.

    A true boolean value means "master", a false boolean value or a missing
    key means None, and any other value is taken to be the name of a ref.
    """
    # Reading the config is best-effort: git may be missing, the key may be
    # unset, or there may be no usable output.
    lookup_errors = (subprocess.CalledProcessError, OSError, IndexError, UnicodeDecodeError)

    try:
        # If it's a boolean, then git can canonicalize for us.
        branch = capture_output(["git", "config", "--bool", "xxx.branch"])[0]
    except lookup_errors:
        try:
            # If it's a branch name, then canonicalization fails.
            branch = capture_output(["git", "config", "xxx.branch"])[0]
        except lookup_errors:
            # Config isn't present.
            branch = None

    # equivalent to passing --branch
    if branch == "true":
        return "master"

    # Explicitly disabled (git canonicalizes no/off/0 to "false"), or unable
    # to get a value.  Without the check for "false" it would be returned as
    # if it were the name of a ref.
    if branch == "false" or not branch:
        return None

    # equivalent to passing --branch=…
    return branch


def capture_output(argv):
    """
    Run the command given as the argument list *argv* and return its standard
    output as a list of lines, decoded as UTF-8.  Standard error is discarded.

    Raises subprocess.CalledProcessError if the command exits non-zero.

    This wrapper exists because the capture_output parameter of
    subprocess.run() wasn't added until Python 3.7, and I'm aiming for compat
    with 3.5.
    """
    raw = subprocess.check_output(argv, stderr = subprocess.DEVNULL)
    text = raw.decode("utf-8")

    return text.splitlines()


def ltrim(prefix, string):
    """
    Return *string* without *prefix* at its start, or unchanged if it doesn't
    start with *prefix*.
    """
    if not string.startswith(prefix):
        return string

    return string[len(prefix):]

def rtrim(suffix, string):
    """
    Return *string* without *suffix* at its end, or unchanged if it doesn't
    end with *suffix*.  A suffix directly before a final newline counts as
    being at the end too.
    """
    at_end = re.compile(re.escape(suffix) + '$')

    return at_end.sub('', string)


if __name__ == "__main__":
    import sys

    try:
        MAIN(sys.argv[1:])

    except KeyboardInterrupt:
        # Print a newline to leave cleaner console output
        print()

    except BrokenPipeError:
        # From https://docs.python.org/3/library/signal.html#note-on-sigpipe:
        #
        #    Python flushes standard streams on exit.  Redirect remaining output
        #    to /dev/null to avoid another BrokenPipeError at shutdown.
        #
        null_fd = os.open(os.devnull, os.O_WRONLY)
        os.dup2(null_fd, sys.stdout.fileno())