From 0c1d099f87a883e52c42d3fd1f1052ad3967e647 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 28 Jul 2022 13:32:16 +0200 Subject: [PATCH] Deprecate metrics (#4739) * Deprecate public metric functions * Test metric deprecation warnings * Deprecate metrics in docs * Remove mentions to metrics in docs and README * Deprecate internal metric functions/classes * Warn metric deprecation only once * Deprecate Metric class * Support deprecating __init__ method for subclassed classes * Move deprecated decorator to __init__ class method * Update deprecation message in docs * Remove mentions to metrics in docstring/README * Remove new_metric_script template * Skip metric tests * Remove metrics from code quality check * Remove metric test requirements * Add rouge_score test requirement needed by bigbench * Remove metrics additional tests requirements * Remove test requirements only used by metrics * Address requested changes * Update deprecation version after latest release * Remove repeated comment * Give hint to switch to evaluate * Fix minor details * Revert removal of metrics CI tests * Revert removal of metrics CI tests * Fix style * Mock emitted_deprecation_warnings to test warnings --- README.md | 14 +- docs/source/about_metrics.mdx | 6 + docs/source/how_to_metrics.mdx | 2 +- docs/source/index.mdx | 4 +- docs/source/loading.mdx | 2 +- docs/source/metrics.mdx | 2 +- .../package_reference/loading_methods.mdx | 6 + setup.py | 11 +- src/datasets/inspect.py | 21 ++- src/datasets/load.py | 167 +++++++++++------- src/datasets/metric.py | 15 ++ src/datasets/utils/deprecation_utils.py | 9 +- src/datasets/utils/logging.py | 2 +- templates/new_metric_script.py | 104 ----------- tests/test_warnings.py | 34 ++++ 15 files changed, 207 insertions(+), 192 deletions(-) delete mode 100644 templates/new_metric_script.py create mode 100644 tests/test_warnings.py diff --git a/README.md b/README.md index ed86b2f2b0a..b6b39f794fb 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ -🤗 Datasets also provides access to +40 evaluation metrics and is designed to let the community easily add and share new datasets and evaluation metrics. +🤗 Datasets is designed to let the community easily add and share new datasets. 🤗 Datasets has many additional interesting features: @@ -85,15 +85,13 @@ For more details on using the library with NumPy, pandas, PyTorch or TensorFlow, - `datasets.list_datasets()` to list the available datasets - `datasets.load_dataset(dataset_name, **kwargs)` to instantiate a dataset -- `datasets.list_metrics()` to list the available metrics -- `datasets.load_metric(metric_name, **kwargs)` to instantiate a metric This library can be used for text/image/audio/etc. datasets. 
Here is an example to load a text dataset: Here is a quick example: ```python -from datasets import list_datasets, load_dataset, list_metrics, load_metric +from datasets import list_datasets, load_dataset # Print all the available datasets print(list_datasets()) @@ -102,12 +100,6 @@ print(list_datasets()) squad_dataset = load_dataset('squad') print(squad_dataset['train'][0]) -# List all the available metrics -print(list_metrics()) - -# Load a metric -squad_metric = load_metric('squad') - # Process the dataset - add a column with the length of the context texts dataset_with_length = squad_dataset.map(lambda x: {"length": len(x["context"])}) @@ -150,7 +142,7 @@ If you are familiar with the great TensorFlow Datasets, here are the main differ # Disclaimers -Similar to TensorFlow Datasets, 🤗 Datasets is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use them. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license. +Similar to TensorFlow Datasets, 🤗 Datasets is a utility library that downloads and prepares public datasets. We do not host or distribute most of these datasets, vouch for their quality or fairness, or claim that you have license to use them. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license. If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a [GitHub issue](https://github.com/huggingface/datasets/issues/new). Thanks for your contribution to the ML community! diff --git a/docs/source/about_metrics.mdx b/docs/source/about_metrics.mdx index f47843d6303..2e5b722f988 100644 --- a/docs/source/about_metrics.mdx +++ b/docs/source/about_metrics.mdx @@ -1,5 +1,11 @@ # All about metrics + + +Metrics is deprecated in 🤗 Datasets. To learn more about how to use metrics, take a look at the library 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index)! In addition to metrics, you can find more tools for evaluating models and datasets. + + + 🤗 Datasets provides access to a wide range of NLP metrics. You can load metrics associated with benchmark datasets like GLUE or SQuAD, and complex metrics like BLEURT or BERTScore, with a single command: [`load_metric`]. Once you've loaded a metric, easily compute and evaluate a model's performance. ## ELI5: `load_metric` diff --git a/docs/source/how_to_metrics.mdx b/docs/source/how_to_metrics.mdx index 2023193c145..157214e5559 100644 --- a/docs/source/how_to_metrics.mdx +++ b/docs/source/how_to_metrics.mdx @@ -2,7 +2,7 @@ -Metrics will soon be deprecated in 🤗 Datasets. To learn more about how to use metrics, take a look at our newest library 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index)! In addition to metrics, we've also added more tools for evaluating models and datasets. +Metrics is deprecated in 🤗 Datasets. To learn more about how to use metrics, take a look at the library 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index)! In addition to metrics, you can find more tools for evaluating models and datasets. 
diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 3432ca76404..993ffe1a4e2 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -2,9 +2,9 @@ -🤗 Datasets is a library for easily accessing and sharing datasets, and evaluation metrics for Natural Language Processing (NLP), computer vision, and audio tasks. +🤗 Datasets is a library for easily accessing and sharing datasets for Natural Language Processing (NLP), computer vision, and audio tasks. -Load a dataset in a single line of code, and use our powerful data processing methods to quickly get your dataset ready for training in a deep learning model. Backed by the Apache Arrow format, process large datasets with zero-copy reads without any memory constraints for optimal speed and efficiency. We also feature a deep integration with the [Hugging Face Hub](https://huggingface.co/datasets), allowing you to easily load and share a dataset with the wider NLP community. There are currently over 2658 datasets, and more than 34 metrics available. +Load a dataset in a single line of code, and use our powerful data processing methods to quickly get your dataset ready for training in a deep learning model. Backed by the Apache Arrow format, process large datasets with zero-copy reads without any memory constraints for optimal speed and efficiency. We also feature a deep integration with the [Hugging Face Hub](https://huggingface.co/datasets), allowing you to easily load and share a dataset with the wider NLP community. Find your dataset today on the [Hugging Face Hub](https://huggingface.co/datasets), and take an in-depth look inside of it with the live viewer. diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index 6bebb16ae84..eef4d3290f0 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -340,7 +340,7 @@ Now when you look at your dataset features, you can see it uses the custom label -Metrics will soon be deprecated in 🤗 Datasets. To learn more about how to use metrics, take a look at our newest library 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index)! In addition to metrics, we've also added more tools for evaluating models and datasets. +Metrics is deprecated in 🤗 Datasets. To learn more about how to use metrics, take a look at the library 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index)! In addition to metrics, you can find more tools for evaluating models and datasets. diff --git a/docs/source/metrics.mdx b/docs/source/metrics.mdx index 684378db2b1..3342fa847c4 100644 --- a/docs/source/metrics.mdx +++ b/docs/source/metrics.mdx @@ -2,7 +2,7 @@ -Metrics will soon be deprecated in 🤗 Datasets. To learn more about how to use metrics, take a look at our newest library 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index)! In addition to metrics, we've also added more tools for evaluating models and datasets. +Metrics is deprecated in 🤗 Datasets. To learn more about how to use metrics, take a look at the library 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index)! In addition to metrics, you can find more tools for evaluating models and datasets. diff --git a/docs/source/package_reference/loading_methods.mdx b/docs/source/package_reference/loading_methods.mdx index cdae9ed21cb..66508e3dbf1 100644 --- a/docs/source/package_reference/loading_methods.mdx +++ b/docs/source/package_reference/loading_methods.mdx @@ -22,6 +22,12 @@ Methods for listing and loading datasets and metrics: ## Metrics + + +Metrics is deprecated in 🤗 Datasets. 
To learn more about how to use metrics, take a look at the library 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index)! In addition to metrics, you can find more tools for evaluating models and datasets. + + + [[autodoc]] datasets.list_metrics [[autodoc]] datasets.load_metric diff --git a/setup.py b/setup.py index 5b6db65f0c7..30cfb29fd28 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,7 @@ "botocore>=1.22.8", # to be compatible with aiobotocore and boto3 "faiss-cpu>=1.6.4", "fsspec[s3]", + "lz4", "moto[s3,server]==2.0.4", "rarfile>=4.0", "s3fs>=2021.11.1", # aligned with fsspec[http]>=2021.11.1 @@ -132,29 +133,29 @@ "bs4", "conllu", "h5py", - "langdetect", "lxml", - "lz4", "mwparserfromhell", - "nltk", "openpyxl", "py7zr", - "tldextract", "zstandard", "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz", "sentencepiece", # bigbench requires t5 which requires seqio which requires sentencepiece + "rouge_score<0.0.7", # required by bigbench: bigbench.api.util.bb_utils > t5.evaluation.metrics > rouge_score "sacremoses", # metrics dependencies "bert_score>=0.3.6", "jiwer", + "langdetect", "mauve-text", - "rouge_score<0.0.7", + "nltk", + # "rouge_score<0.0.7", # also required by bigbench "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", # for bleurt "seqeval", + "tldextract", # to speed up pip backtracking "toml>=0.10.1", "requests_file>=1.5.1", diff --git a/src/datasets/inspect.py b/src/datasets/inspect.py index 2e7e702e766..9f62076af95 100644 --- a/src/datasets/inspect.py +++ b/src/datasets/inspect.py @@ -13,7 +13,7 @@ # limitations under the License. # Lint as: python3 -""" List and inspect datasets and metrics.""" +""" List and inspect datasets.""" import inspect import os @@ -28,6 +28,7 @@ from .download.streaming_download_manager import StreamingDownloadManager from .info import DatasetInfo from .load import dataset_module_factory, import_main_class, load_dataset_builder, metric_module_factory +from .utils.deprecation_utils import deprecated from .utils.file_utils import relative_to_absolute_path from .utils.logging import get_logger from .utils.version import Version @@ -70,9 +71,18 @@ def list_datasets(with_community_datasets=True, with_details=False): return datasets +@deprecated( + "Use 'evaluate.list_evaluation_modules' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate" +) def list_metrics(with_community_metrics=True, with_details=False): """List all the metrics script available on the Hugging Face Hub. + + + Use `evaluate.list_evaluation_modules` instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate + + + Args: with_community_metrics (:obj:`bool`, optional, default ``True``): Include the community provided metrics. with_details (:obj:`bool`, optional, default ``False``): Return the full details on the metrics instead of only the short name. @@ -138,10 +148,19 @@ def inspect_dataset(path: str, local_path: str, download_config: Optional[Downlo ) +@deprecated( + "Use 'evaluate.inspect_evaluation_module' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate" +) def inspect_metric(path: str, local_path: str, download_config: Optional[DownloadConfig] = None, **download_kwargs): r""" Allow inspection/modification of a metric script by copying it on local drive at local_path. 
+ + + Use `evaluate.inspect_evaluation_module` instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate + + + Args: path (``str``): path to the dataset processing script with the dataset builder. Can be either: diff --git a/src/datasets/load.py b/src/datasets/load.py index 567587583fd..a7cb698b727 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -21,6 +21,7 @@ import os import shutil import time +import warnings from collections import Counter from dataclasses import dataclass from pathlib import Path @@ -60,6 +61,7 @@ ) from .splits import Split from .tasks import TaskTemplate +from .utils.deprecation_utils import deprecated from .utils.file_utils import ( OfflineModeIsEnabled, _raise_if_offline_mode_is_enabled, @@ -510,8 +512,16 @@ def get_module(self) -> DatasetModule: class GithubMetricModuleFactory(_MetricModuleFactory): - """Get the module of a metric. The metric script is downloaded from GitHub.""" + """Get the module of a metric. The metric script is downloaded from GitHub. + + + Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate + + + """ + + @deprecated("Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate") def __init__( self, name: str, @@ -577,8 +587,16 @@ def get_module(self) -> MetricModule: class LocalMetricModuleFactory(_MetricModuleFactory): - """Get the module of a local metric. The metric script is loaded from a local script.""" + """Get the module of a local metric. The metric script is loaded from a local script. + + + + Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate + + + """ + @deprecated("Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate") def __init__( self, path: str, @@ -1017,8 +1035,15 @@ class CachedMetricModuleFactory(_MetricModuleFactory): """ Get the module of a metric that has been loaded once already and cached. The script that is loaded from the cache is the most recent one with a matching name. + + + + Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate + + """ + @deprecated("Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate") def __init__( self, name: str, @@ -1251,6 +1276,7 @@ def dataset_module_factory( ) +@deprecated("Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate") def metric_module_factory( path: str, revision: Optional[Union[str, Version]] = None, @@ -1262,7 +1288,13 @@ """ Download/extract/cache a metric module. - Metrics codes are cached inside the the dynamic modules cache to allow easy import (avoid ugly sys.path tweaks). + + + Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate + + + + Metrics codes are cached inside the dynamic modules cache to allow easy import (avoid ugly sys.path tweaks). 
Args: @@ -1292,51 +1324,56 @@ def metric_module_factory( Returns: MetricModule """ - if download_config is None: - download_config = DownloadConfig(**download_kwargs) - download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) - download_config.extract_compressed_file = True - download_config.force_extract = True - - filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1] - if not filename.endswith(".py"): - filename = filename + ".py" - combined_path = os.path.join(path, filename) - # Try locally - if path.endswith(filename): - if os.path.isfile(path): + with warnings.catch_warnings(): + # Ignore equivalent warnings to the one already issued + warnings.filterwarnings("ignore", message=".*https://huggingface.co/docs/evaluate$", category=FutureWarning) + + if download_config is None: + download_config = DownloadConfig(**download_kwargs) + download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) + download_config.extract_compressed_file = True + download_config.force_extract = True + + filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1] + if not filename.endswith(".py"): + filename = filename + ".py" + combined_path = os.path.join(path, filename) + # Try locally + if path.endswith(filename): + if os.path.isfile(path): + return LocalMetricModuleFactory( + path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path + ).get_module() + else: + raise FileNotFoundError(f"Couldn't find a metric script at {relative_to_absolute_path(path)}") + elif os.path.isfile(combined_path): return LocalMetricModuleFactory( - path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path + combined_path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path ).get_module() - else: - raise FileNotFoundError(f"Couldn't find a metric script at {relative_to_absolute_path(path)}") - elif os.path.isfile(combined_path): - return LocalMetricModuleFactory( - combined_path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path - ).get_module() - elif is_relative_path(path) and path.count("/") == 0: - try: - return GithubMetricModuleFactory( - path, - revision=revision, - download_config=download_config, - download_mode=download_mode, - dynamic_modules_path=dynamic_modules_path, - ).get_module() - except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached. + elif is_relative_path(path) and path.count("/") == 0: try: - return CachedMetricModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module() - except Exception as e2: # noqa: if it's not in the cache, then it doesn't exist. - if not isinstance(e1, FileNotFoundError): - raise e1 from None - raise FileNotFoundError( - f"Couldn't find a metric script at {relative_to_absolute_path(combined_path)}. " - f"Metric '{path}' doesn't exist on the Hugging Face Hub either." - ) from None - else: - raise FileNotFoundError(f"Couldn't find a metric script at {relative_to_absolute_path(combined_path)}.") + return GithubMetricModuleFactory( + path, + revision=revision, + download_config=download_config, + download_mode=download_mode, + dynamic_modules_path=dynamic_modules_path, + ).get_module() + except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached. 
+ try: + return CachedMetricModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module() + except Exception as e2: # noqa: if it's not in the cache, then it doesn't exist. + if not isinstance(e1, FileNotFoundError): + raise e1 from None + raise FileNotFoundError( + f"Couldn't find a metric script at {relative_to_absolute_path(combined_path)}. " + f"Metric '{path}' doesn't exist on the Hugging Face Hub either." + ) from None + else: + raise FileNotFoundError(f"Couldn't find a metric script at {relative_to_absolute_path(combined_path)}.") +@deprecated("Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate") def load_metric( path: str, config_name: Optional[str] = None, @@ -1352,6 +1389,12 @@ def load_metric( ) -> Metric: """Load a `datasets.Metric`. + + + Use `evaluate.load` instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate + + + Args: path (``str``): @@ -1385,25 +1428,29 @@ def load_metric( {'accuracy': 0.5} ``` """ - download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) - metric_module = metric_module_factory( - path, revision=revision, download_config=download_config, download_mode=download_mode - ).module_path - metric_cls = import_main_class(metric_module, dataset=False) - metric = metric_cls( - config_name=config_name, - process_id=process_id, - num_process=num_process, - cache_dir=cache_dir, - keep_in_memory=keep_in_memory, - experiment_id=experiment_id, - **metric_init_kwargs, - ) + with warnings.catch_warnings(): + # Ignore equivalent warnings to the one already issued + warnings.filterwarnings("ignore", message=".*https://huggingface.co/docs/evaluate$", category=FutureWarning) + + download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) + metric_module = metric_module_factory( + path, revision=revision, download_config=download_config, download_mode=download_mode + ).module_path + metric_cls = import_main_class(metric_module, dataset=False) + metric = metric_cls( + config_name=config_name, + process_id=process_id, + num_process=num_process, + cache_dir=cache_dir, + keep_in_memory=keep_in_memory, + experiment_id=experiment_id, + **metric_init_kwargs, + ) - # Download and prepare resources for the metric - metric.download_and_prepare(download_config=download_config) + # Download and prepare resources for the metric + metric.download_and_prepare(download_config=download_config) - return metric + return metric def load_dataset_builder( diff --git a/src/datasets/metric.py b/src/datasets/metric.py index a53ed7e7613..5aa8a025735 100644 --- a/src/datasets/metric.py +++ b/src/datasets/metric.py @@ -31,6 +31,7 @@ from .features import Features from .info import DatasetInfo, MetricInfo from .naming import camelcase_to_snakecase +from .utils.deprecation_utils import deprecated from .utils.filelock import BaseFileLock, FileLock, Timeout from .utils.logging import get_logger from .utils.py_utils import copyfunc, temp_seed @@ -76,6 +77,13 @@ def format_chunk(chunk): class MetricInfoMixin: """This base class exposes some attributes of MetricInfo at the base level of the Metric for easy access. + + + + Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate + + + """ def __init__(self, info: MetricInfo): @@ -138,6 +146,12 @@ def format(self) -> Optional[str]: class Metric(MetricInfoMixin): """A Metric is the base class and common API for all metrics. 
+ + + Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate + + + Args: config_name (``str``): This is used to define a hash specific to a metrics computation script and prevents the metric's data to be overridden when the metric loading script is modified. @@ -155,6 +169,7 @@ class Metric(MetricInfoMixin): timeout (``Union[int, float]``): Timeout in second for distributed setting synchronization. """ + @deprecated("Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate") def __init__( self, config_name: Optional[str] = None, diff --git a/src/datasets/utils/deprecation_utils.py b/src/datasets/utils/deprecation_utils.py index c8fd1305391..6d70f42ae7b 100644 --- a/src/datasets/utils/deprecation_utils.py +++ b/src/datasets/utils/deprecation_utils.py @@ -20,12 +20,11 @@ def deprecated(help_message: Optional[str] = None): def decorator(deprecated_function: Callable): global _emitted_deprecation_warnings + name = deprecated_function.__name__ + # Support deprecating __init__ class method: class name instead + name = name if name != "__init__" else deprecated_function.__qualname__.split(".")[-2] warning_msg = ( - ( - f"{deprecated_function.__name__} is deprecated and will be removed " - "in the next major version of datasets." - ) - + f" {help_message}" + f"{name} is deprecated and will be removed in the next major version of datasets." + f" {help_message}" if help_message else "" ) diff --git a/src/datasets/utils/logging.py b/src/datasets/utils/logging.py index 811aebbaff0..ebe9389c818 100644 --- a/src/datasets/utils/logging.py +++ b/src/datasets/utils/logging.py @@ -77,7 +77,7 @@ def _reset_library_root_logger() -> None: def get_logger(name: Optional[str] = None) -> logging.Logger: """Return a logger with the specified name. - This function can be used in dataset and metrics scripts. + This function can be used in dataset scripts. """ if name is None: name = _get_library_name() diff --git a/templates/new_metric_script.py b/templates/new_metric_script.py deleted file mode 100644 index 5168beda388..00000000000 --- a/templates/new_metric_script.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""TODO: Add a description here.""" - -import datasets - - -# TODO: Add BibTeX citation -_CITATION = """\ -@InProceedings{huggingface:metric, -title = {A great new metric}, -authors={huggingface, Inc.}, -year={2020} -} -""" - -# TODO: Add description of the metric here -_DESCRIPTION = """\ -This new metric is designed to solve this great NLP task and is crafted with a lot of care. -""" - - -# TODO: Add description of the arguments of the metric here -_KWARGS_DESCRIPTION = """ -Calculates how good are predictions given some references, using certain scores -Args: - predictions: list of predictions to score. Each predictions - should be a string with tokens separated by spaces. - references: list of reference for each prediction. 
Each - reference should be a string with tokens separated by spaces. -Returns: - accuracy: description of the first score, - another_score: description of the second score, -Examples: - Examples should be written in doctest format, and should illustrate how - to use the function. - - >>> my_new_metric = datasets.load_metric("my_new_metric") - >>> results = my_new_metric.compute(references=[0, 1], predictions=[0, 1]) - >>> print(results) - {'accuracy': 1.0} -""" - -# TODO: Define external resources urls if needed -BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt" - - -@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) -class NewMetric(datasets.Metric): - """TODO: Short description of my metric.""" - - def _info(self): - # TODO: Specifies the datasets.MetricInfo object - return datasets.MetricInfo( - # This is the description that will appear on the metrics page. - description=_DESCRIPTION, - citation=_CITATION, - inputs_description=_KWARGS_DESCRIPTION, - # This defines the format of each prediction and reference - features=datasets.Features({ - 'predictions': datasets.Value('string'), - 'references': datasets.Value('string'), - }), - # Homepage of the metric for documentation - homepage="http://metric.homepage", - # Additional links to the codebase or references - codebase_urls=["http://github.com/path/to/codebase/of/new_metric"], - reference_urls=["http://path.to.reference.url/new_metric"] - ) - - def _download_and_prepare(self, dl_manager): - """Optional: download external resources useful to compute the scores""" - # TODO: Download external resources if needed - bad_words_path = dl_manager.download_and_extract(BAD_WORDS_URL) - self.bad_words = {w.strip() for w in open(bad_words_path, encoding="utf-8")} - - def _compute(self, predictions, references): - """Returns the scores""" - # TODO: Compute the different scores of the metric - accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions) - - if self.config_name == "max": - second_score = max(abs(len(i) - len(j)) for i, j in zip(predictions, references) if i not in self.bad_words) - elif self.config_name == "mean": - second_score = sum(abs(len(i) - len(j)) for i, j in zip(predictions, references) if i not in self.bad_words) - second_score /= sum(i not in self.bad_words for i in predictions) - else: - raise ValueError(f"Invalid config name for NewMetric: {self.config_name}. 
Please use 'max' or 'mean'.") - - return { - "accuracy": accuracy, - "second_score": second_score, - } diff --git a/tests/test_warnings.py b/tests/test_warnings.py new file mode 100644 index 00000000000..eedcbb82ae4 --- /dev/null +++ b/tests/test_warnings.py @@ -0,0 +1,34 @@ +import pytest + +from datasets import inspect_metric, list_metrics, load_metric + + +@pytest.fixture +def mock_emitted_deprecation_warnings(monkeypatch): + monkeypatch.setattr("datasets.utils.deprecation_utils._emitted_deprecation_warnings", set()) + + +# Used by list_metrics +@pytest.fixture +def mock_hfh(monkeypatch): + class MetricMock: + def __init__(self, metric_id): + self.id = metric_id + + class HfhMock: + _metrics = [MetricMock(metric_id) for metric_id in ["accuracy", "mse", "precision", "codeparrot/apps_metric"]] + + def list_metrics(self): + return self._metrics + + monkeypatch.setattr("datasets.inspect.huggingface_hub", HfhMock()) + + +@pytest.mark.parametrize( + "func, args", [(load_metric, ("metrics/mse",)), (list_metrics, ()), (inspect_metric, ("metrics/mse", "tmp_path"))] +) +def test_metric_deprecation_warning(func, args, mock_emitted_deprecation_warnings, mock_hfh, tmp_path): + if "tmp_path" in args: + args = tuple(arg if arg != "tmp_path" else tmp_path for arg in args) + with pytest.warns(FutureWarning, match="https://huggingface.co/docs/evaluate"): + func(*args)
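
For readers using this patch as a migration guide, here is a minimal, illustrative sketch of the replacement workflow that the deprecation messages above point to. It assumes the separate 🤗 Evaluate package is installed (`pip install evaluate`), which this patch does not add as a dependency; `evaluate.list_evaluation_modules` and `evaluate.load` are the replacements named in the deprecation messages, while the specific metric name and inputs below are only an example.

```python
# Migration sketch: datasets metrics -> 🤗 Evaluate (assumes `pip install evaluate`)
import evaluate

# Replaces datasets.list_metrics()
print(evaluate.list_evaluation_modules())

# Replaces datasets.load_metric("accuracy")
accuracy = evaluate.load("accuracy")

# compute() keeps the predictions/references keyword signature used by Metric.compute
results = accuracy.compute(predictions=[0, 1], references=[0, 1])
print(results)  # {'accuracy': 1.0}
```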