# tests/func/plots/test_show.py

import os

import pytest

from dvc.cli import main
from dvc.dvcfile import PIPELINE_FILE
from dvc.exceptions import OverlappingOutputPathsError
from dvc.repo import Repo
from dvc.repo.plots import PlotMetricTypeError
from dvc.utils import onerror_collect
from dvc.utils.fs import remove
from dvc.utils.serialize import EncodingError, YAMLFileCorruptedError
from tests.utils.plots import get_plot


def test_show_targets(tmp_dir, dvc):
    """Check that a plot can be targeted by name and by its ``fs_path``."""
    target = tmp_dir / "metric.json"
    expected = [{"first_val": 100, "val": 2}, {"first_val": 200, "val": 3}]
    target.dump_json(expected, sort_keys=True)

    # Target passed as a list holding the bare file name.
    by_name = dvc.plots.show(targets=["metric.json"])
    assert get_plot(by_name, "workspace", file="metric.json") == expected

    # Target passed as a single string (not a list): the file's ``fs_path``.
    by_path = dvc.plots.show(targets=target.fs_path)
    assert get_plot(by_path, "workspace", file="metric.json") == expected


def test_plot_cache_missing(tmp_dir, scm, dvc, caplog, run_copy_metrics):
    """Check that a revision with missing plot data yields an error entry.

    Two revisions of the same plot are tagged ``v1`` and ``v2``. The
    workspace copy and the cache entry of the ``v2`` output are then removed,
    so ``v1`` must still load while ``v2`` must carry a
    ``FileNotFoundError`` under its ``error`` key.
    """
    metric1 = [{"y": 2}, {"y": 3}]
    (tmp_dir / "metric_t.json").dump_json(metric1, sort_keys=True)
    run_copy_metrics(
        "metric_t.json",
        "metric.json",
        plots=["metric.json"],
        commit="there is metric",
    )
    scm.tag("v1")

    # Make a different plot and then remove its datafile
    metric2 = [{"y": 3}, {"y": 4}]
    (tmp_dir / "metric_t.json").dump_json(metric2, sort_keys=True)
    stage = run_copy_metrics(
        "metric_t.json",
        "metric.json",
        plots=["metric.json"],
        commit="there is an another metric",
    )
    scm.tag("v2")

    # Drop both the workspace file and its cache entry. ``fs_path`` is the
    # attribute the other tests in this module use on stage outputs.
    plot_out = stage.outs[0]
    remove(plot_out.fs_path)
    remove(plot_out.cache_path)

    plots_data = dvc.plots.show(revs=["v1", "v2"], targets=["metric.json"])

    assert get_plot(plots_data, "v1", file="metric.json") == metric1
    assert isinstance(
        get_plot(plots_data, "v2", file="metric.json", endkey="error"),
        FileNotFoundError,
    )


def test_plot_wrong_metric_type(tmp_dir, scm, dvc, run_copy_metrics):
    """Check that a ``.txt`` plot target is reported as a type error."""
    tmp_dir.gen("metric_t.txt", "some text")
    run_copy_metrics(
        "metric_t.txt",
        "metric.txt",
        commit="add text metric",
        plots_no_cache=["metric.txt"],
    )

    # Errors are collected into the result instead of being raised.
    shown = dvc.plots.show(onerror=onerror_collect, targets=["metric.txt"])
    error = get_plot(shown, "workspace", file="metric.txt", endkey="error")

    assert isinstance(error, PlotMetricTypeError)


@pytest.mark.parametrize("use_dvc", [True, False])
def test_show_non_plot(tmp_dir, scm, use_dvc):
    """Check that a plain JSON file can be shown without being a plot output.

    Runs once against a freshly initialized repo and once against a ``Repo``
    opened with ``uninitialized=True``.
    """
    expected = [{"first_val": 100, "val": 2}, {"first_val": 200, "val": 3}]
    (tmp_dir / "metric.json").dump_json(expected, sort_keys=True)

    dvc = Repo.init() if use_dvc else Repo(uninitialized=True)

    shown = dvc.plots.show(targets=["metric.json"])

    assert get_plot(shown, "workspace", file="metric.json") == expected


def test_show_non_plot_and_plot_with_params(
    tmp_dir, scm, dvc, run_copy_metrics
):
    """Show an untracked file together with a plot output that has props.

    ``metric.json`` is a plain file, ``metric2.json`` is its copy registered
    as a plot of the ``train`` stage and given a title via ``plots.modify``.
    Both must expose the same data; only the plot output carries the props.
    """
    expected = [{"first_val": 100, "val": 2}, {"first_val": 200, "val": 3}]
    (tmp_dir / "metric.json").dump_json(expected, sort_keys=True)
    run_copy_metrics(
        "metric.json",
        "metric2.json",
        name="train",
        plots_no_cache=["metric2.json"],
    )
    title_props = {"title": "TITLE"}
    dvc.plots.modify("metric2.json", props=title_props)

    shown = dvc.plots.show(targets=["metric.json", "metric2.json"])

    assert get_plot(shown, "workspace", file="metric.json") == expected
    assert get_plot(shown, "workspace", file="metric2.json") == expected

    shown_props = get_plot(
        shown, "workspace", file="metric2.json", endkey="props"
    )
    assert shown_props == title_props


def test_show_from_subdir(tmp_dir, dvc, capsys):
    """Run ``plots show`` via the CLI from inside a subdirectory.

    The command must succeed, print the subdirectory's URI, and write
    ``dvc_plots/index.html`` inside that subdirectory.
    """
    subdir = tmp_dir / "subdir"
    subdir.mkdir()

    rows = [{"first_val": 100, "val": 2}, {"first_val": 200, "val": 3}]
    (subdir / "metric.json").dump_json(rows, sort_keys=True)

    with subdir.chdir():
        exit_code = main(["plots", "show", "metric.json"])
        assert exit_code == 0

    stdout = capsys.readouterr().out
    assert subdir.as_uri() in stdout

    plots_dir = subdir / "dvc_plots"
    assert plots_dir.is_dir()
    assert (plots_dir / "index.html").is_file()


def test_plots_show_non_existing(tmp_dir, dvc, caplog):
    """Check that a missing target is reported in the result and the log."""
    shown = dvc.plots.show(targets=["plot.json"])

    error = get_plot(shown, "workspace", file="plot.json", endkey="error")
    assert isinstance(error, FileNotFoundError)

    expected_message = "'plot.json' was not found in current workspace."
    assert expected_message in caplog.text


@pytest.mark.parametrize("clear_before_run", [True, False])
def test_plots_show_overlap(tmp_dir, dvc, run_copy_metrics, clear_before_run):
    """Check that overlapping outputs surface as a collected error.

    A valid ``cp-m1`` stage produces a plot inside ``data``; a second stage
    declaring the whole ``data`` directory as its output is then written
    into ``dvc.yaml``. ``plots.show`` must report
    ``OverlappingOutputPathsError`` for the workspace.
    """
    data_dir = tmp_dir / "data"
    data_dir.mkdir()

    source = data_dir / "m1_temp.yaml"
    plot_path = str(data_dir / "m1.yaml")
    source.dump({"a": {"b": {"c": 2, "d": 1}}})
    run_copy_metrics(
        str(source),
        plot_path,
        plots=[plot_path],
        name="cp-m1",
        commit="add m1",
        single_stage=False,
    )

    # Add a stage whose output is the directory holding cp-m1's plot, in
    # order to provoke the overlapping-outputs error.
    conflicting_stage = {"cmd": "mkdir data", "outs": ["data"]}
    with (tmp_dir / "dvc.yaml").modify() as dvcfile:
        dvcfile["stages"]["corrupted-stage"] = conflicting_stage

    # Exercise both variants -- with and without the data directory and the
    # local cache wiped first -- so the check also holds for optimized paths.
    if clear_before_run:
        remove(data_dir)
        remove(dvc.odb.local.cache_dir)

    dvc._reset()

    shown = dvc.plots.show(onerror=onerror_collect)
    error = get_plot(shown, "workspace", endkey="error")

    assert isinstance(error, OverlappingOutputPathsError)


def test_dir_plots(tmp_dir, dvc, run_copy_metrics):
    """Check that a directory plot output expands to the files inside it.

    The ``copy_double`` stage writes two copies of ``file.json`` into
    ``subdir`` and declares the directory itself as a plot. Props set on the
    directory via ``plots.modify`` must show up for each contained file.
    """
    subdir = tmp_dir / "subdir"
    subdir.mkdir()

    metric = [{"first_val": 100, "val": 2}, {"first_val": 200, "val": 3}]

    fname = "file.json"
    (tmp_dir / fname).dump_json(metric, sort_keys=True)

    # Build the paths with os.path.join, as test_collect_non_existing_dir
    # does, rather than hard-coding a "/" separator.
    p1 = os.path.join("subdir", "p1.json")
    p2 = os.path.join("subdir", "p2.json")
    tmp_dir.dvc.run(
        cmd=(
            f"mkdir subdir && python copy.py {fname} {p1} && "
            f"python copy.py {fname} {p2}"
        ),
        deps=[fname],
        single_stage=False,
        plots=["subdir"],
        name="copy_double",
    )
    props = {"title": "TITLE"}
    dvc.plots.modify("subdir", props=props)

    result = dvc.plots.show()

    assert set(get_plot(result, "workspace")) == {p1, p2}
    assert get_plot(result, "workspace", typ="definitions", file="") == {
        p1: props,
        p2: props,
    }


def test_ignore_parsing_error(tmp_dir, dvc, run_copy_metrics):
    """Check that an undecodable plot file is collected as EncodingError."""
    # 0xC1 never appears in valid UTF-8, so the copied plot file cannot be
    # read as UTF-8 text.
    undecodable = b"\xc1"
    with open("file", "wb") as raw_file:
        raw_file.write(undecodable)

    run_copy_metrics("file", "plot_file.json", plots=["plot_file.json"])
    shown = dvc.plots.show(onerror=onerror_collect)

    error = get_plot(
        shown, "workspace", file="plot_file.json", endkey="error"
    )
    assert isinstance(error, EncodingError)


@pytest.mark.parametrize(
    "file,path_kwargs",
    (
        (PIPELINE_FILE, {"revision": "workspace", "endkey": "error"}),
        (
            "plot.yaml",
            {"revision": "workspace", "file": "plot.yaml", "endkey": "error"},
        ),
    ),
)
def test_log_errors(
    tmp_dir, scm, dvc, run_copy_metrics, file, path_kwargs, capsys
):
    """Check that a corrupted YAML file is collected and reported on stderr.

    ``file`` is the file that gets corrupted (the pipeline file or the plot
    data file); ``path_kwargs`` are the ``get_plot`` arguments locating the
    resulting error in the ``plots.show`` output.
    """
    rows = [{"val": 2}, {"val": 3}]
    (tmp_dir / "metric_t.yaml").dump(rows)
    run_copy_metrics(
        "metric_t.yaml",
        "plot.yaml",
        name="train",
        plots=["plot.yaml"],
        single_stage=False,
    )
    scm.tag("v1")

    # Append a line that makes the chosen file invalid YAML.
    with open(file, "a", encoding="utf-8") as broken:
        broken.write("\nMALFORMED!")

    shown = dvc.plots.show(onerror=onerror_collect)
    stderr = capsys.readouterr().err

    assert isinstance(get_plot(shown, **path_kwargs), YAMLFileCorruptedError)

    expected_message = (
        "DVC failed to load some plots for following revisions: 'workspace'."
    )
    assert expected_message in stderr


def test_plots_binary(tmp_dir, scm, dvc, run_copy_metrics, custom_template):
    """Check that binary plots are returned as raw bytes per revision.

    ``plot.jpg`` is produced by stage ``s2`` and tagged as ``v1``; the
    workspace copy is then overwritten. Showing both revisions must return
    the original bytes for ``v1`` and the new bytes for the workspace.
    """
    tagged_bytes = b"content"
    workspace_bytes = b"content2"

    with open("image.jpg", "wb") as image:
        image.write(tagged_bytes)

    dvc.add(["image.jpg"])
    run_copy_metrics(
        "image.jpg",
        "plot.jpg",
        name="s2",
        plots=["plot.jpg"],
        commit="run training",
        single_stage=False,
    )

    scm.add(["dvc.yaml", "dvc.lock"])
    scm.commit("initial")
    scm.tag("v1")

    # Overwrite the plot in the workspace only; the tag keeps the old bytes.
    with open("plot.jpg", "wb") as plot:
        plot.write(workspace_bytes)

    shown = dvc.plots.show(revs=["v1", "workspace"])

    assert get_plot(shown, "v1", file="plot.jpg") == tagged_bytes
    assert get_plot(shown, "workspace", file="plot.jpg") == workspace_bytes


def test_collect_non_existing_dir(tmp_dir, dvc, run_copy_metrics):
    """Check that a missing directory plot does not block other plots.

    The ``copy_double`` stage declares ``subdir`` as a plot; its output is
    then removed from both the cache and the workspace. ``plots.show`` must
    record an error in the definitions while still loading ``plot.json``.
    """
    subdir = tmp_dir / "subdir"
    subdir.mkdir()

    plot_rows = [{"first_val": 100, "val": 2}, {"first_val": 200, "val": 3}]
    subdir_rows = [{"y": 101, "x": 3}, {"y": 202, "x": 4}]

    plot_source = "source.json"
    subdir_source = "subdir_source.json"
    (tmp_dir / plot_source).dump_json(plot_rows, sort_keys=True)
    (tmp_dir / subdir_source).dump_json(subdir_rows, sort_keys=True)

    first_copy = os.path.join("subdir", "p1.json")
    second_copy = os.path.join("subdir", "p2.json")
    copy_cmd = (
        f"mkdir subdir && python copy.py {subdir_source} {first_copy} && "
        f"python copy.py {subdir_source} {second_copy}"
    )
    subdir_stage = tmp_dir.dvc.run(
        cmd=copy_cmd,
        deps=[subdir_source],
        plots=["subdir"],
        name="copy_double",
        single_stage=False,
    )

    run_copy_metrics(
        plot_source,
        "plot.json",
        commit="there is metric",
        plots=["plot.json"],
    )

    # Remove the directory plot from the cache and from the workspace.
    subdir_out = subdir_stage.outs[0]
    remove(subdir_out.cache_path)
    remove(subdir_out.fs_path)

    shown = dvc.plots.show()

    definitions_error = get_plot(
        shown, "workspace", typ="definitions", file="", endkey="error"
    )
    assert definitions_error

    # The remaining plot must still be loaded.
    assert get_plot(shown, "workspace", file="plot.json") == plot_rows


@pytest.mark.parametrize(
    "plot_config,expected_datafiles",
    [
        # TODO - enable providing data files for x
        # (
        #     {
        #         "comparison": {
        #             "x": {"data1.json": "a"},
        #             "y": {"sub/dir/data2.json": "b"},
        #         }
        #     },
        #     ["data1.json", os.path.join("sub", "dir", "data2.json")],
        # ),
        (
            {"data1.json": {"x": "c", "y": "a", "title": "File as key test"}},
            ["data1.json"],
        ),
        (
            {
                "infer_data_from_y": {
                    "x": "a",
                    "y": {"data1.json": "b", "sub/dir/data2.json": "c"},
                }
            },
            ["data1.json", os.path.join("sub", "dir", "data2.json")],
        ),
    ],
)
@pytest.mark.parametrize("separate_config", [True, False])
def test_load_from_config(
    tmp_dir,
    dvc,
    plot_config,
    expected_datafiles,
    separate_config,
    run_copy_metrics,
):
    """Check that plot definitions and their data load from a config.

    ``plot_config`` is the plots definition under test and
    ``expected_datafiles`` lists the data files it should pull in. With
    ``separate_config`` the definition is written to ``plot_config.json``
    and passed through ``config_files``; otherwise it is stored under the
    ``plots`` key of ``dvc.yaml``.
    """
    data = {
        "data1.json": [
            {"a": 1, "b": 0.1, "c": 0.01},
            {"a": 2, "b": 0.2, "c": 0.02},
        ],
        # The name has to match the "sub/dir/data2.json" source referenced by
        # the parametrized configs; otherwise the nested file is never
        # created and its content is never compared below.
        os.path.join("sub", "dir", "data2.json"): [
            {"a": 6, "b": 0.6, "c": 0.06},
            {"a": 7, "b": 0.7, "c": 0.07},
        ],
    }

    for filename, content in data.items():
        dirname = os.path.dirname(filename)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        (tmp_dir / filename).dump_json(content, sort_keys=True)

    config_files = None
    if separate_config:
        (tmp_dir / "plot_config.json").dump_json(plot_config, sort_keys=True)
        config_file = "plot_config.json"
        config_files = {config_file}
    else:
        # TODO we need that to create any stage, as dvc.yaml plots
        #     collections bases on existing stages - fix collection
        run_copy_metrics("data1.json", "copy.json", name="train")

        from dvc.utils.serialize import modify_yaml

        config_file = "dvc.yaml"
        with modify_yaml(config_file) as dvcfile_content:
            dvcfile_content["plots"] = plot_config

    result = dvc.plots.show(config_files=config_files)

    # The definition must be reported back under the config file it came from.
    assert plot_config == get_plot(
        result, "workspace", typ="definitions", file=config_file
    )

    # Files named by the config must be loaded with their exact content;
    # files it does not name must be absent from the result.
    for filename, content in data.items():
        if filename in expected_datafiles:
            assert content == get_plot(result, "workspace", file=filename)
        else:
            assert filename not in get_plot(result, "workspace")