diff --git a/.circleci/config.yml b/.circleci/config.yml index 1e4f2e3192d..69ac69332df 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -152,15 +152,6 @@ commands: args: --no-build-isolation <<# parameters.editable >> --editable <> . descr: Install torchvision <<# parameters.editable >> in editable mode <> - install_prototype_dependencies: - steps: - - pip_install: - args: iopath - descr: Install third-party dependencies - - pip_install: - args: --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install torchdata from nightly releases - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. # This command can be used if only a selection of tests need to be run, for ad-hoc files. run_tests_selective: @@ -326,7 +317,6 @@ jobs: - checkout - install_torchvision: editable: true - - install_prototype_dependencies - pip_install: args: mypy descr: Install Python type check utilities diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index d93ddb0bed3..829a21568d5 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -152,15 +152,6 @@ commands: args: --no-build-isolation <<# parameters.editable >> --editable <> . descr: Install torchvision <<# parameters.editable >> in editable mode <> - install_prototype_dependencies: - steps: - - pip_install: - args: iopath - descr: Install third-party dependencies - - pip_install: - args: --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install torchdata from nightly releases - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. # This command can be used if only a selection of tests need to be run, for ad-hoc files. run_tests_selective: @@ -326,7 +317,6 @@ jobs: - checkout - install_torchvision: editable: true - - install_prototype_dependencies - pip_install: args: mypy descr: Install Python type check utilities diff --git a/.github/workflows/prototype-tests.yml b/.github/workflows/prototype-tests.yml deleted file mode 100644 index 5e9ca360d08..00000000000 --- a/.github/workflows/prototype-tests.yml +++ /dev/null @@ -1,83 +0,0 @@ -name: tests - -on: - pull_request: - -jobs: - prototype: - strategy: - matrix: - os: - - ubuntu-latest - - windows-latest - - macos-latest - fail-fast: false - - runs-on: ${{ matrix.os }} - - steps: - - name: Set up python - uses: actions/setup-python@v3 - with: - python-version: 3.7 - - - name: Upgrade system packages - run: python -m pip install --upgrade pip setuptools wheel - - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Install PyTorch nightly builds - run: pip install --progress-bar=off --pre torch torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu/ - - - name: Install torchvision - run: pip install --progress-bar=off --no-build-isolation --editable . 
- - - name: Install other prototype dependencies - run: pip install --progress-bar=off scipy pycocotools h5py iopath - - - name: Install test requirements - run: pip install --progress-bar=off pytest pytest-mock pytest-cov - - - name: Mark setup as complete - id: setup - run: exit 0 - - - name: Run prototype features tests - shell: bash - run: | - pytest \ - --durations=20 \ - --cov=torchvision/prototype/features \ - --cov-report=term-missing \ - test/test_prototype_features*.py - - - name: Run prototype datasets tests - if: success() || ( failure() && steps.setup.conclusion == 'success' ) - shell: bash - run: | - pytest \ - --durations=20 \ - --cov=torchvision/prototype/datasets \ - --cov-report=term-missing \ - test/test_prototype_datasets*.py - - - name: Run prototype transforms tests - if: success() || ( failure() && steps.setup.conclusion == 'success' ) - shell: bash - run: | - pytest \ - --durations=20 \ - --cov=torchvision/prototype/transforms \ - --cov-report=term-missing \ - test/test_prototype_transforms*.py - - - name: Run prototype models tests - if: success() || ( failure() && steps.setup.conclusion == 'success' ) - shell: bash - run: | - pytest \ - --durations=20 \ - --cov=torchvision/prototype/models \ - --cov-report=term-missing \ - test/test_prototype_models*.py diff --git a/mypy.ini b/mypy.ini index c1d174f4595..aaeea57a691 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,52 +7,6 @@ allow_redefinition = True no_implicit_optional = True warn_redundant_casts = True -[mypy-torchvision.prototype.features.*] - -; untyped definitions and calls -disallow_untyped_defs = True - -; None and Optional handling -no_implicit_optional = True - -; warnings -warn_unused_ignores = True -warn_return_any = True - -; miscellaneous strictness flags -allow_redefinition = True - -[mypy-torchvision.prototype.transforms.*] - -; untyped definitions and calls -disallow_untyped_defs = True - -; None and Optional handling -no_implicit_optional = True - -; warnings -warn_unused_ignores = True -warn_return_any = True - -; miscellaneous strictness flags -allow_redefinition = True - -[mypy-torchvision.prototype.datasets.*] - -; untyped definitions and calls -disallow_untyped_defs = True - -; None and Optional handling -no_implicit_optional = True - -; warnings -warn_unused_ignores = True -warn_return_any = True -warn_unreachable = True - -; miscellaneous strictness flags -allow_redefinition = True - [mypy-torchvision.io.image.*] ignore_errors = True @@ -149,10 +103,6 @@ ignore_missing_imports = True ignore_missing_imports = True -[mypy-torchdata.*] - -ignore_missing_imports = True - [mypy-h5py.*] ignore_missing_imports = True diff --git a/pytest.ini b/pytest.ini index 1dde465d32f..ca753944859 100644 --- a/pytest.ini +++ b/pytest.ini @@ -7,7 +7,6 @@ addopts = # enable all warnings -Wd --ignore=test/test_datasets_download.py - --ignore-glob=test/test_prototype_*.py testpaths = test xfail_strict = True diff --git a/references/depth/stereo/README.md b/references/depth/stereo/README.md deleted file mode 100644 index 922fbcdb3ae..00000000000 --- a/references/depth/stereo/README.md +++ /dev/null @@ -1,180 +0,0 @@ -# Stereo Matching reference training scripts - -This folder contains reference training scripts for Stereo Matching. -They serve as a log of how to train specific models, so as to provide baseline -training and evaluation scripts to quickly bootstrap research. 
-
-
-### CREStereo
-
-The CREStereo model was trained on a dataset mixture between **CREStereo**, **ETH3D** and the additional split from **Middlebury2014**.
-A ratio of **88-6-6** was used in order to train a baseline weight set. We provide a multi-set variant as well.
-Both runs used 8 A100 GPUs and a batch size of 2 (so the effective batch size is 16). The
-rest of the hyper-parameters loosely follow the recipe from https://github.com/megvii-research/CREStereo.
-The original recipe trains for **300000** updates (or steps) on the dataset mixture. We modify the learning rate
-schedule to one that starts decaying the learning rate much sooner. Throughout our experiments we found that this reduces overfitting
-at evaluation time, and that gradient clipping helps stabilize the loss around the premature learning rate change.
-
-```
-torchrun --nproc_per_node 8 --nnodes 1 train.py \
-    --dataset-root $dataset_root \
-    --name $name_cre \
-    --model crestereo_base \
-    --train-datasets crestereo eth3d-train middlebury2014-other \
-    --dataset-steps 264000 18000 18000 \
-    --batch-size 2 \
-    --lr 0.0004 \
-    --min-lr 0.00002 \
-    --lr-decay-method cosine \
-    --warmup-steps 6000 \
-    --decay-after-steps 30000 \
-    --clip-grad-norm 1.0
-```
-
-We employ a multi-set fine-tuning stage where we uniformly sample from multiple datasets. Given that some of these datasets have extremely large images (``2048x2048`` or more) we opt for a very aggressive scale range ``[0.2 - 0.8]`` such that as much as possible of the original frame composition is captured inside the ``384x512`` crop.
-
-```
-torchrun --nproc_per_node 8 --nnodes 1 train.py \
-    --dataset-root $dataset_root \
-    --name $name_things \
-    --model crestereo_base \
-    --train-datasets crestereo eth3d-train middlebury2014-other instereo2k fallingthings carla-highres sintel sceneflow-monkaa sceneflow-driving \
-    --dataset-steps 12000 12000 12000 12000 12000 12000 12000 12000 12000 \
-    --batch-size 2 \
-    --scale-range 0.2 0.8 \
-    --lr 0.0004 \
-    --lr-decay-method cosine \
-    --decay-after-steps 0 \
-    --warmup-steps 0 \
-    --min-lr 0.00002 \
-    --resume-path $checkpoint_dir/$name_cre.pth
-```
-
-
-### Evaluation
-
-Evaluating the base weights:
-
-```
-torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_ETH_MBL_V1
-```
-
-This should give an **mae of about 1.416** on the train set of `Middlebury2014`. Results may vary slightly depending on the batch size and the number of GPUs. For the most accurate results, use 1 GPU and `--batch-size 1`.
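The nested structure of the evaluation log comes from `cascade_evaluation.py` (deleted further down in this diff), which sweeps every combination of the `--n_cascades` and `--n_iterations` values passed on the command line and records one metrics dict per pair. A minimal, self-contained sketch of that sweep is shown here; `fake_eval` is a hypothetical stand-in for the real `_evaluate(...)` call and its numbers are meaningless.

```python
from typing import Callable, Dict, Iterable


def sweep_configurations(
    evaluate_fn: Callable[[int, int], Dict[str, float]],
    n_cascades: Iterable[int] = (1, 2),
    n_iterations: Iterable[int] = (2, 5, 10, 20),
) -> Dict[int, Dict[int, Dict[str, float]]]:
    """Collect metrics as {cascades: {iterations: metrics}}, mirroring the eval log layout."""
    results: Dict[int, Dict[int, Dict[str, float]]] = {}
    for cascades in n_cascades:
        results[cascades] = {}
        for iters in n_iterations:
            # In the real script this runs cascaded inference with `cascades`
            # pyramid levels and `iters` recurrent refinement updates.
            results[cascades][iters] = evaluate_fn(cascades, iters)
    return results


if __name__ == "__main__":
    def fake_eval(cascades: int, iters: int) -> Dict[str, float]:
        # Placeholder metric, purely illustrative.
        return {"mae": round(2.0 / (cascades * iters) + 0.7, 3)}

    print(sweep_configurations(fake_eval))
```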
The created log file should look like this, where the first key is the number of cascades and the nested key is the number of recursive iterations: - -``` -Dataset: middlebury2014-train @size: [384, 512]: -{ - 1: { - 2: {'mae': 2.363, 'rmse': 4.352, '1px': 0.611, '3px': 0.828, '5px': 0.891, 'relepe': 0.176, 'fl-all': 64.511} - 5: {'mae': 1.618, 'rmse': 3.71, '1px': 0.761, '3px': 0.879, '5px': 0.918, 'relepe': 0.154, 'fl-all': 77.128} - 10: {'mae': 1.416, 'rmse': 3.53, '1px': 0.777, '3px': 0.896, '5px': 0.933, 'relepe': 0.148, 'fl-all': 78.388} - 20: {'mae': 1.448, 'rmse': 3.583, '1px': 0.771, '3px': 0.893, '5px': 0.931, 'relepe': 0.145, 'fl-all': 77.7} - }, -} -{ - 2: { - 2: {'mae': 1.972, 'rmse': 4.125, '1px': 0.73, '3px': 0.865, '5px': 0.908, 'relepe': 0.169, 'fl-all': 74.396} - 5: {'mae': 1.403, 'rmse': 3.448, '1px': 0.793, '3px': 0.905, '5px': 0.937, 'relepe': 0.151, 'fl-all': 80.186} - 10: {'mae': 1.312, 'rmse': 3.368, '1px': 0.799, '3px': 0.912, '5px': 0.943, 'relepe': 0.148, 'fl-all': 80.379} - 20: {'mae': 1.376, 'rmse': 3.542, '1px': 0.796, '3px': 0.91, '5px': 0.942, 'relepe': 0.149, 'fl-all': 80.054} - }, -} -``` - -You can also evaluate the Finetuned weights: - -``` -torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_FINETUNE_MULTI_V1 -``` - -``` -Dataset: middlebury2014-train @size: [384, 512]: -{ - 1: { - 2: {'mae': 1.85, 'rmse': 3.797, '1px': 0.673, '3px': 0.862, '5px': 0.917, 'relepe': 0.171, 'fl-all': 69.736} - 5: {'mae': 1.111, 'rmse': 3.166, '1px': 0.838, '3px': 0.93, '5px': 0.957, 'relepe': 0.134, 'fl-all': 84.596} - 10: {'mae': 1.02, 'rmse': 3.073, '1px': 0.854, '3px': 0.938, '5px': 0.96, 'relepe': 0.129, 'fl-all': 86.042} - 20: {'mae': 0.993, 'rmse': 3.059, '1px': 0.855, '3px': 0.942, '5px': 0.967, 'relepe': 0.126, 'fl-all': 85.784} - }, -} -{ - 2: { - 2: {'mae': 1.667, 'rmse': 3.867, '1px': 0.78, '3px': 0.891, '5px': 0.922, 'relepe': 0.165, 'fl-all': 78.89} - 5: {'mae': 1.158, 'rmse': 3.278, '1px': 0.843, '3px': 0.926, '5px': 0.955, 'relepe': 0.135, 'fl-all': 84.556} - 10: {'mae': 1.046, 'rmse': 3.13, '1px': 0.85, '3px': 0.934, '5px': 0.96, 'relepe': 0.13, 'fl-all': 85.464} - 20: {'mae': 1.021, 'rmse': 3.102, '1px': 0.85, '3px': 0.935, '5px': 0.963, 'relepe': 0.129, 'fl-all': 85.417} - }, -} -``` - -Evaluating the author provided weights: - -``` -torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.MEGVII_V1 -``` - -``` -Dataset: middlebury2014-train @size: [384, 512]: -{ - 1: { - 2: {'mae': 1.704, 'rmse': 3.738, '1px': 0.738, '3px': 0.896, '5px': 0.933, 'relepe': 0.157, 'fl-all': 76.464} - 5: {'mae': 0.956, 'rmse': 2.963, '1px': 0.88, '3px': 0.948, '5px': 0.965, 'relepe': 0.124, 'fl-all': 88.186} - 10: {'mae': 0.792, 'rmse': 2.765, '1px': 0.905, '3px': 0.958, '5px': 0.97, 'relepe': 0.114, 'fl-all': 90.429} - 20: {'mae': 0.749, 'rmse': 2.706, '1px': 0.907, '3px': 0.961, '5px': 0.972, 'relepe': 0.113, 'fl-all': 90.807} - }, -} -{ - 2: { - 2: {'mae': 1.702, 'rmse': 3.784, '1px': 0.784, '3px': 0.894, '5px': 0.924, 'relepe': 0.172, 'fl-all': 80.313} - 5: {'mae': 0.932, 'rmse': 2.907, '1px': 0.877, '3px': 0.944, '5px': 0.963, 'relepe': 0.125, 'fl-all': 87.979} - 10: {'mae': 0.773, 'rmse': 2.768, '1px': 0.901, '3px': 0.958, '5px': 0.972, 'relepe': 0.117, 'fl-all': 90.43} - 20: {'mae': 
0.854, 'rmse': 2.971, '1px': 0.9, '3px': 0.957, '5px': 0.97, 'relepe': 0.122, 'fl-all': 90.269}
-    },
-}
-```
-
-# Concerns when training
-
-We encourage users to be aware of the **aspect ratio** and **disparity scale** they are targeting when doing any sort of training or fine-tuning. The model is highly sensitive to these two factors; as a consequence, with naive multi-set fine-tuning one can reach `0.2 mae` relatively fast. We recommend that users pay close attention to how they **balance dataset sizing** when training such networks.
-
-Ideally, dataset scaling should be treated at an individual level, and a thorough **EDA** of the disparity distribution in random crops at the desired training / inference size should be performed prior to any large compute investments.
-
-### Disparity scaling
-
-##### Sample A
-The top row contains a sample from `Sintel` whereas the bottom row contains one from `Middlebury`.
-
-![Disparity1](assets/disparity-domain-drift.jpg)
-
-From left to right: `left_image`, `right_image`, `valid_mask`, `valid_mask & ground_truth`, `prediction`. **Darker is further away, lighter is closer**. In the case of `Sintel`, which is more closely aligned to the original distribution of `CREStereo`, we notice that the model accurately predicts the background scale, whereas in the case of `Middlebury2014` it cannot correctly estimate the continuous disparity. Notice that the frame composition is similar for both examples. The blue skybox in the `Sintel` scene behaves similarly to the `Middlebury` black background. However, because the `Middlebury` sample comes from an extremely large scene, the crop size of `384x512` does not correctly capture the general training distribution.
-
-##### Sample B
-
-The top row contains a scene from `Sceneflow` using the `Monkaa` split whilst the bottom row is a scene from `Middlebury`. This sample exhibits the same issues when it comes to **background estimation**. Given the exaggerated size of the `Middlebury` samples, the model **collapses the smooth background** of the sample to what it considers to be a mean background disparity value.
-
-![Disparity2](assets/disparity-background-mode-collapse.jpg)
-
-For more detail on why this behaviour occurs depending on the training distribution proportions, you can read more about the network at: https://github.com/pytorch/vision/pull/6629#discussion_r978160493
-
-### Metric overfitting
-
-##### Learning is critical in the beginning
-
-We also advise users to make use of faster training schedules, as the performance gain over long periods of time is marginal. Here we exhibit the difference between a faster decay schedule and a later decay schedule.
-
-![Loss1](assets/Loss.jpg)
-
-In **grey** we set the lr decay to begin after `30000` steps, whilst in **orange** we opt for a very late learning rate decay at around `180000` steps. Although it exhibits stronger variance, we can notice that unfreezing the learning rate earlier whilst employing `gradient-norm` clipping out-performs the default configuration.
-
-##### Gradient norm saves time
-
-![Loss2](assets/gradient-norm-removal.jpg)
-
-In **grey** we keep ``gradient norm`` clipping enabled whilst in **orange** we do not. We can notice that removing the gradient norm exacerbates the performance decrease in the early stages, whilst also showcasing an almost complete collapse around the `60000` steps mark where we started decaying the lr for **orange**. A minimal sketch of this clipping step is shown below.
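For reference, the ``gradient norm`` discussed here is plain gradient-norm clipping applied before the optimizer step. Below is a minimal sketch of that step on a toy model, assuming the `--clip-grad-norm 1.0` value from the recipe above; the removed `train.py` follows the same pattern, additionally calling `scaler.unscale_(optimizer)` first when mixed precision is enabled.

```python
import torch

# Toy model and data, purely to illustrate the clipping step.
model = torch.nn.Linear(8, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=4e-4)
clip_grad_norm = 1.0  # mirrors --clip-grad-norm 1.0 in the recipe above

x, y = torch.randn(16, 8), torch.randn(16, 1)
loss = torch.nn.functional.l1_loss(model(x), y)

optimizer.zero_grad()
loss.backward()
# Rescale gradients so their global L2 norm does not exceed clip_grad_norm
# before taking the optimizer step.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_grad_norm)
optimizer.step()
```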
- -Although both runs ahieve an improvement of about ``0.1`` mae after the lr decay start, the benefits of it are observable much faster when ``gradient norm`` is employed as the recovery period is no longer accounted for. diff --git a/references/depth/stereo/__init__.py b/references/depth/stereo/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/references/depth/stereo/assets/Loss.jpg b/references/depth/stereo/assets/Loss.jpg deleted file mode 100644 index b6db8e204af..00000000000 Binary files a/references/depth/stereo/assets/Loss.jpg and /dev/null differ diff --git a/references/depth/stereo/assets/disparity-background-mode-collapse.jpg b/references/depth/stereo/assets/disparity-background-mode-collapse.jpg deleted file mode 100644 index b6542e8814f..00000000000 Binary files a/references/depth/stereo/assets/disparity-background-mode-collapse.jpg and /dev/null differ diff --git a/references/depth/stereo/assets/disparity-domain-drift.jpg b/references/depth/stereo/assets/disparity-domain-drift.jpg deleted file mode 100644 index 8a98de03675..00000000000 Binary files a/references/depth/stereo/assets/disparity-domain-drift.jpg and /dev/null differ diff --git a/references/depth/stereo/assets/gradient-norm-removal.jpg b/references/depth/stereo/assets/gradient-norm-removal.jpg deleted file mode 100644 index 2c3c8459d5e..00000000000 Binary files a/references/depth/stereo/assets/gradient-norm-removal.jpg and /dev/null differ diff --git a/references/depth/stereo/cascade_evaluation.py b/references/depth/stereo/cascade_evaluation.py deleted file mode 100644 index 0fe8eb3b1a6..00000000000 --- a/references/depth/stereo/cascade_evaluation.py +++ /dev/null @@ -1,299 +0,0 @@ -import os -import warnings - -import torch -import torchvision -import torchvision.prototype.models.depth.stereo -import utils -from torch.nn import functional as F -from train import make_eval_loader - -from utils.metrics import AVAILABLE_METRICS -from vizualization import make_prediction_image_side_to_side - - -def get_args_parser(add_help=True): - import argparse - - parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Evaluation", add_help=add_help) - parser.add_argument("--dataset", type=str, default="middlebury2014-train", help="dataset to use") - parser.add_argument("--dataset-root", type=str, default="", help="root of the dataset") - - parser.add_argument("--checkpoint", type=str, default="", help="path to weights") - parser.add_argument("--weights", type=str, default=None, help="torchvision API weight") - parser.add_argument( - "--model", - type=str, - default="crestereo_base", - help="which model to use if not speciffying a training checkpoint", - ) - parser.add_argument("--img-folder", type=str, default="images") - - parser.add_argument("--batch-size", type=int, default=1, help="batch size") - parser.add_argument("--workers", type=int, default=0, help="number of workers") - - parser.add_argument("--eval-size", type=int, nargs="+", default=[384, 512], help="resize size") - parser.add_argument( - "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization" - ) - parser.add_argument( - "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization" - ) - parser.add_argument( - "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False - ) - parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity") - parser.add_argument( - 
"--interpolation-strategy", - type=str, - default="bilinear", - help="interpolation strategy", - choices=["bilinear", "bicubic", "mixed"], - ) - - parser.add_argument("--n_iterations", nargs="+", type=int, default=[10], help="number of recurent iterations") - parser.add_argument("--n_cascades", nargs="+", type=int, default=[1], help="number of cascades") - parser.add_argument( - "--metrics", - type=str, - nargs="+", - default=["mae", "rmse", "1px", "3px", "5px", "relepe"], - help="metrics to log", - choices=AVAILABLE_METRICS, - ) - parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training") - - parser.add_argument("--world-size", type=int, default=1, help="number of distributed processes") - parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training") - parser.add_argument("--device", type=str, default="cuda", help="device to use for training") - - parser.add_argument("--save-images", action="store_true", help="save images of the predictions") - parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"]) - - return parser - - -def cascade_inference(model, image_left, image_right, iterations, cascades): - # check that image size is divisible by 16 * (2 ** (cascades - 1)) - for image in [image_left, image_right]: - if image.shape[-2] % ((2 ** (cascades - 1))) != 0: - raise ValueError( - f"image height is not divisible by {16 * (2 ** (cascades - 1))}. Image shape: {image.shape[-2]}" - ) - - if image.shape[-1] % ((2 ** (cascades - 1))) != 0: - raise ValueError( - f"image width is not divisible by {16 * (2 ** (cascades - 1))}. Image shape: {image.shape[-2]}" - ) - - left_image_pyramid = [image_left] - right_image_pyramid = [image_right] - for idx in range(0, cascades - 1): - ds_factor = int(2 ** (idx + 1)) - ds_shape = (image_left.shape[-2] // ds_factor, image_left.shape[-1] // ds_factor) - left_image_pyramid += F.interpolate(image_left, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze(0) - right_image_pyramid += F.interpolate(image_right, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze( - 0 - ) - - flow_init = None - for left_image, right_image in zip(reversed(left_image_pyramid), reversed(right_image_pyramid)): - flow_pred = model(left_image, right_image, flow_init, num_iters=iterations) - # flow pred is a list - flow_init = flow_pred[-1] - - return flow_init - - -@torch.inference_mode() -def _evaluate( - model, - args, - val_loader, - *, - padder_mode, - print_freq=10, - writter=None, - step=None, - iterations=10, - cascades=1, - batch_size=None, - header=None, - save_images=False, - save_path="", -): - """Helper function to compute various metrics (epe, etc.) for a model on a given dataset. - We process as many samples as possible with ddp. 
- """ - model.eval() - header = header or "Test:" - device = torch.device(args.device) - metric_logger = utils.MetricLogger(delimiter=" ") - - iterations = iterations or args.recurrent_updates - - logger = utils.MetricLogger() - for meter_name in args.metrics: - logger.add_meter(meter_name, fmt="{global_avg:.4f}") - if "fl-all" not in args.metrics: - logger.add_meter("fl-all", fmt="{global_avg:.4f}") - - num_processed_samples = 0 - with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): - batch_idx = 0 - for blob in metric_logger.log_every(val_loader, print_freq, header): - image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) - padder = utils.InputPadder(image_left.shape, mode=padder_mode) - image_left, image_right = padder.pad(image_left, image_right) - - disp_pred = cascade_inference(model, image_left, image_right, iterations, cascades) - disp_pred = disp_pred[:, :1, :, :] - disp_pred = padder.unpad(disp_pred) - - if save_images: - if args.distributed: - rank_prefix = args.rank - else: - rank_prefix = 0 - make_prediction_image_side_to_side( - disp_pred, disp_gt, valid_disp_mask, save_path, prefix=f"batch_{rank_prefix}_{batch_idx}" - ) - - metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) - num_processed_samples += image_left.shape[0] - for name in metrics: - logger.meters[name].update(metrics[name], n=1) - - batch_idx += 1 - - num_processed_samples = utils.reduce_across_processes(num_processed_samples) / args.world_size - - print("Num_processed_samples: ", num_processed_samples) - if ( - hasattr(val_loader.dataset, "__len__") - and len(val_loader.dataset) != num_processed_samples - and torch.distributed.get_rank() == 0 - ): - warnings.warn( - f"Number of processed samples {num_processed_samples} is different" - f"from the dataset size {len(val_loader.dataset)}. This may happen if" - "the dataset is not divisible by the batch size. Try lowering the batch size for more accurate results." 
- ) - - if writter is not None and args.rank == 0: - for meter_name, meter_value in logger.meters.items(): - scalar_name = f"{meter_name} {header}" - writter.add_scalar(scalar_name, meter_value.avg, step) - - logger.synchronize_between_processes() - print(header, logger) - - logger_metrics = {k: v.global_avg for k, v in logger.meters.items()} - return logger_metrics - - -def evaluate(model, loader, args, writter=None, step=None): - os.makedirs(args.img_folder, exist_ok=True) - checkpoint_name = os.path.basename(args.checkpoint) or args.weights - image_checkpoint_folder = os.path.join(args.img_folder, checkpoint_name) - - metrics = {} - base_image_folder = os.path.join(image_checkpoint_folder, args.dataset) - os.makedirs(base_image_folder, exist_ok=True) - - for n_cascades in args.n_cascades: - for n_iters in args.n_iterations: - - config = f"{n_cascades}c_{n_iters}i" - config_image_folder = os.path.join(base_image_folder, config) - os.makedirs(config_image_folder, exist_ok=True) - - metrics[config] = _evaluate( - model, - args, - loader, - padder_mode=args.padder_type, - header=f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{n_cascades} n_iters:{n_iters}", - batch_size=args.batch_size, - writter=writter, - step=step, - iterations=n_iters, - cascades=n_cascades, - save_path=config_image_folder, - save_images=args.save_images, - ) - - metric_log = [] - metric_log_dict = {} - # print the final results - for config in metrics: - config_tokens = config.split("_") - config_iters = config_tokens[1][:-1] - config_cascades = config_tokens[0][:-1] - - metric_log_dict[config_cascades] = metric_log_dict.get(config_cascades, {}) - metric_log_dict[config_cascades][config_iters] = metrics[config] - - evaluation_str = f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{config_cascades} recurrent_updates:{config_iters}" - metrics_str = f"Metrics: {metrics[config]}" - metric_log.extend([evaluation_str, metrics_str]) - - print(evaluation_str) - print(metrics_str) - - eval_log_name = f"{checkpoint_name.replace('.pth', '')}_eval.log" - print("Saving eval log to: ", eval_log_name) - with open(eval_log_name, "w") as f: - f.write(f"Dataset: {args.dataset} @size: {args.eval_size}:\n") - # write the dict line by line for each key, and each value in the keys - for config_cascades in metric_log_dict: - f.write("{\n") - f.write(f"\t{config_cascades}: {{\n") - for config_iters in metric_log_dict[config_cascades]: - # convert every metric to 4 decimal places - metrics = metric_log_dict[config_cascades][config_iters] - metrics = {k: float(f"{v:.3f}") for k, v in metrics.items()} - f.write(f"\t\t{config_iters}: {metrics}\n") - f.write("\t},\n") - f.write("}\n") - - -def load_checkpoint(args): - utils.setup_ddp(args) - - if not args.weights: - checkpoint = torch.load(args.checkpoint, map_location=torch.device("cpu")) - if "model" in checkpoint: - experiment_args = checkpoint["args"] - model = torchvision.prototype.models.depth.stereo.__dict__[experiment_args.model](weights=None) - model.load_state_dict(checkpoint["model"]) - else: - model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=None) - model.load_state_dict(checkpoint) - - # set the appropiate devices - if args.distributed and args.device == "cpu": - raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") - device = torch.device(args.device) - else: - model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) - - # convert to DDP if need be 
- if args.distributed: - model = model.to(args.device) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) - else: - model.to(device) - - return model - - -def main(args): - model = load_checkpoint(args) - loader = make_eval_loader(args.dataset, args) - evaluate(model, loader, args) - - -if __name__ == "__main__": - args = get_args_parser().parse_args() - main(args) diff --git a/references/depth/stereo/parsing.py b/references/depth/stereo/parsing.py deleted file mode 100644 index 71a3ba9904e..00000000000 --- a/references/depth/stereo/parsing.py +++ /dev/null @@ -1,89 +0,0 @@ -import argparse -from functools import partial - -import torch - -from presets import StereoMatchingEvalPreset, StereoMatchingTrainPreset -from torchvision.datasets import ( - CarlaStereo, - CREStereo, - ETH3DStereo, - FallingThingsStereo, - InStereo2k, - Kitti2012Stereo, - Kitti2015Stereo, - Middlebury2014Stereo, - SceneFlowStereo, - SintelStereo, -) - -VALID_DATASETS = { - "crestereo": partial(CREStereo), - "carla-highres": partial(CarlaStereo), - "instereo2k": partial(InStereo2k), - "sintel": partial(SintelStereo), - "sceneflow-monkaa": partial(SceneFlowStereo, variant="Monkaa", pass_name="both"), - "sceneflow-flyingthings": partial(SceneFlowStereo, variant="FlyingThings3D", pass_name="both"), - "sceneflow-driving": partial(SceneFlowStereo, variant="Driving", pass_name="both"), - "fallingthings": partial(FallingThingsStereo, variant="both"), - "eth3d-train": partial(ETH3DStereo, split="train"), - "eth3d-test": partial(ETH3DStereo, split="test"), - "kitti2015-train": partial(Kitti2015Stereo, split="train"), - "kitti2015-test": partial(Kitti2015Stereo, split="test"), - "kitti2012-train": partial(Kitti2012Stereo, split="train"), - "kitti2012-test": partial(Kitti2012Stereo, split="train"), - "middlebury2014-other": partial( - Middlebury2014Stereo, split="additional", use_ambient_view=True, calibration="both" - ), - "middlebury2014-train": partial(Middlebury2014Stereo, split="train", calibration="perfect"), - "middlebury2014-test": partial(Middlebury2014Stereo, split="test", calibration=None), - "middlebury2014-train-ambient": partial( - Middlebury2014Stereo, split="train", use_ambient_views=True, calibrartion="perfect" - ), -} - - -def make_train_transform(args: argparse.Namespace) -> torch.nn.Module: - return StereoMatchingTrainPreset( - resize_size=args.resize_size, - crop_size=args.crop_size, - rescale_prob=args.rescale_prob, - scaling_type=args.scaling_type, - scale_range=args.scale_range, - scale_interpolation_type=args.interpolation_strategy, - use_grayscale=args.use_grayscale, - mean=args.norm_mean, - std=args.norm_std, - horizontal_flip_prob=args.flip_prob, - gpu_transforms=args.gpu_transforms, - max_disparity=args.max_disparity, - spatial_shift_prob=args.spatial_shift_prob, - spatial_shift_max_angle=args.spatial_shift_max_angle, - spatial_shift_max_displacement=args.spatial_shift_max_displacement, - spatial_shift_interpolation_type=args.interpolation_strategy, - gamma_range=args.gamma_range, - brightness=args.brightness_range, - contrast=args.contrast_range, - saturation=args.saturation_range, - hue=args.hue_range, - asymmetric_jitter_prob=args.asymmetric_jitter_prob, - ) - - -def make_eval_transform(args: argparse.Namespace) -> torch.nn.Module: - if args.eval_size is None: - resize_size = args.crop_size - else: - resize_size = args.eval_size - - return StereoMatchingEvalPreset( - mean=args.norm_mean, - std=args.norm_std, - use_grayscale=args.use_grayscale, - 
resize_size=resize_size, - interpolation_type=args.interpolation_strategy, - ) - - -def make_dataset(dataset_name: str, dataset_root: str, transforms: torch.nn.Module) -> torch.utils.data.Dataset: - return VALID_DATASETS[dataset_name](root=dataset_root, transforms=transforms) diff --git a/references/depth/stereo/presets.py b/references/depth/stereo/presets.py deleted file mode 100644 index cadd2405178..00000000000 --- a/references/depth/stereo/presets.py +++ /dev/null @@ -1,144 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -import transforms as T - - -class StereoMatchingEvalPreset(torch.nn.Module): - def __init__( - self, - mean: float = 0.5, - std: float = 0.5, - resize_size: Optional[Tuple[int, ...]] = None, - max_disparity: Optional[float] = None, - interpolation_type: str = "bilinear", - use_grayscale: bool = False, - ) -> None: - super().__init__() - - transforms = [ - T.ToTensor(), - T.ConvertImageDtype(torch.float32), - ] - - if use_grayscale: - transforms.append(T.ConvertToGrayscale()) - - if resize_size is not None: - transforms.append(T.Resize(resize_size, interpolation_type=interpolation_type)) - - transforms.extend( - [ - T.Normalize(mean=mean, std=std), - T.MakeValidDisparityMask(max_disparity=max_disparity), - T.ValidateModelInput(), - ] - ) - - self.transforms = T.Compose(transforms) - - def forward(self, images, disparities, masks): - return self.transforms(images, disparities, masks) - - -class StereoMatchingTrainPreset(torch.nn.Module): - def __init__( - self, - *, - resize_size: Optional[Tuple[int, ...]], - resize_interpolation_type: str = "bilinear", - # RandomResizeAndCrop params - crop_size: Tuple[int, int], - rescale_prob: float = 1.0, - scaling_type: str = "exponential", - scale_range: Tuple[float, float] = (-0.2, 0.5), - scale_interpolation_type: str = "bilinear", - # convert to grayscale - use_grayscale: bool = False, - # normalization params - mean: float = 0.5, - std: float = 0.5, - # processing device - gpu_transforms: bool = False, - # masking - max_disparity: Optional[int] = 256, - # SpatialShift params - spatial_shift_prob: float = 0.5, - spatial_shift_max_angle: float = 0.5, - spatial_shift_max_displacement: float = 0.5, - spatial_shift_interpolation_type: str = "bilinear", - # AssymetricColorJitter - gamma_range: Tuple[float, float] = (0.8, 1.2), - brightness: Union[int, Tuple[int, int]] = (0.8, 1.2), - contrast: Union[int, Tuple[int, int]] = (0.8, 1.2), - saturation: Union[int, Tuple[int, int]] = 0.0, - hue: Union[int, Tuple[int, int]] = 0.0, - asymmetric_jitter_prob: float = 1.0, - # RandomHorizontalFlip - horizontal_flip_prob: float = 0.5, - # RandomOcclusion - occlusion_prob: float = 0.0, - occlusion_px_range: Tuple[int, int] = (50, 100), - # RandomErase - erase_prob: float = 0.0, - erase_px_range: Tuple[int, int] = (50, 100), - erase_num_repeats: int = 1, - ) -> None: - - if scaling_type not in ["linear", "exponential"]: - raise ValueError(f"Unknown scaling type: {scaling_type}. 
Available types: linear, exponential") - - super().__init__() - transforms = [T.ToTensor()] - - # when fixing size across multiple datasets, we ensure - # that the same size is used for all datasets when cropping - if resize_size is not None: - transforms.append(T.Resize(resize_size, interpolation_type=resize_interpolation_type)) - - if gpu_transforms: - transforms.append(T.ToGPU()) - - # color handling - color_transforms = [ - T.AsymmetricColorJitter( - brightness=brightness, contrast=contrast, saturation=saturation, hue=hue, p=asymmetric_jitter_prob - ), - T.AsymetricGammaAdjust(p=asymmetric_jitter_prob, gamma_range=gamma_range), - ] - - if use_grayscale: - color_transforms.append(T.ConvertToGrayscale()) - - transforms.extend(color_transforms) - - transforms.extend( - [ - T.RandomSpatialShift( - p=spatial_shift_prob, - max_angle=spatial_shift_max_angle, - max_px_shift=spatial_shift_max_displacement, - interpolation_type=spatial_shift_interpolation_type, - ), - T.ConvertImageDtype(torch.float32), - T.RandomRescaleAndCrop( - crop_size=crop_size, - scale_range=scale_range, - rescale_prob=rescale_prob, - scaling_type=scaling_type, - interpolation_type=scale_interpolation_type, - ), - T.RandomHorizontalFlip(horizontal_flip_prob), - # occlusion after flip, otherwise we're occluding the reference image - T.RandomOcclusion(p=occlusion_prob, occlusion_px_range=occlusion_px_range), - T.RandomErase(p=erase_prob, erase_px_range=erase_px_range, max_erase=erase_num_repeats), - T.Normalize(mean=mean, std=std), - T.MakeValidDisparityMask(max_disparity), - T.ValidateModelInput(), - ] - ) - - self.transforms = T.Compose(transforms) - - def forward(self, images, disparties, mask): - return self.transforms(images, disparties, mask) diff --git a/references/depth/stereo/train.py b/references/depth/stereo/train.py deleted file mode 100644 index 548387f969d..00000000000 --- a/references/depth/stereo/train.py +++ /dev/null @@ -1,788 +0,0 @@ -import argparse -import os -import warnings -from pathlib import Path -from typing import List, Union - -import numpy as np -import torch -import torch.distributed as dist -import torchvision.models.optical_flow -import torchvision.prototype.models.depth.stereo -import utils -import vizualization - -from parsing import make_dataset, make_eval_transform, make_train_transform, VALID_DATASETS -from torch import nn -from torchvision.transforms.functional import get_dimensions, InterpolationMode, resize -from utils.metrics import AVAILABLE_METRICS -from utils.norm import freeze_batch_norm - - -def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_channels: int) -> torch.Tensor: - """Helper function to make stereo flow from a given model output""" - if isinstance(flow, list): - return [make_stereo_flow(flow_i, model_out_channels) for flow_i in flow] - - B, C, H, W = flow.shape - # we need to add zero flow if the model outputs 2 channels - if C == 1 and model_out_channels == 2: - zero_flow = torch.zeros_like(flow) - # by convention the flow is X-Y axis, so we need the Y flow last - flow = torch.cat([flow, zero_flow], dim=1) - return flow - - -def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> np.ndarray: - """Helper function to return a learning rate scheduler for CRE-stereo""" - if args.decay_after_steps < args.warmup_steps: - raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}") - - warmup_steps = args.warmup_steps if args.warmup_steps else 0 - flat_lr_steps = 
args.decay_after_steps - warmup_steps if args.decay_after_steps else 0 - decay_lr_steps = args.total_iterations - flat_lr_steps - - max_lr = args.lr - min_lr = args.min_lr - - schedulers = [] - milestones = [] - - if warmup_steps > 0: - if args.lr_warmup_method == "linear": - warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=args.lr_warmup_factor, total_iters=warmup_steps - ) - elif args.lr_warmup_method == "constant": - warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( - optimizer, factor=args.lr_warmup_factor, total_iters=warmup_steps - ) - else: - raise ValueError(f"Unknown lr warmup method {args.lr_warmup_method}") - schedulers.append(warmup_lr_scheduler) - milestones.append(warmup_steps) - - if flat_lr_steps > 0: - flat_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=max_lr, total_iters=flat_lr_steps) - schedulers.append(flat_lr_scheduler) - milestones.append(flat_lr_steps + warmup_steps) - - if decay_lr_steps > 0: - if args.lr_decay_method == "cosine": - decay_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=decay_lr_steps, eta_min=min_lr - ) - elif args.lr_decay_method == "linear": - decay_lr_scheduler = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=max_lr, end_factor=min_lr, total_iters=decay_lr_steps - ) - elif args.lr_decay_method == "exponential": - decay_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR( - optimizer, gamma=args.lr_decay_gamma, last_epoch=-1 - ) - else: - raise ValueError(f"Unknown lr decay method {args.lr_decay_method}") - schedulers.append(decay_lr_scheduler) - - scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers, milestones=milestones) - return scheduler - - -def shuffle_dataset(dataset): - """Shuffle the dataset""" - perm = torch.randperm(len(dataset)) - return torch.utils.data.Subset(dataset, perm) - - -def resize_dataset_to_n_steps( - dataset: torch.utils.data.Dataset, dataset_steps: int, samples_per_step: int, args: argparse.Namespace -) -> torch.utils.data.Dataset: - original_size = len(dataset) - if args.steps_is_epochs: - samples_per_step = original_size - target_size = dataset_steps * samples_per_step - - dataset_copies = [] - n_expands, remainder = divmod(target_size, original_size) - for idx in range(n_expands): - dataset_copies.append(dataset) - - if remainder > 0: - dataset_copies.append(torch.utils.data.Subset(dataset, list(range(remainder)))) - - if args.dataset_shuffle: - dataset_copies = [shuffle_dataset(dataset_copy) for dataset_copy in dataset_copies] - - dataset = torch.utils.data.ConcatDataset(dataset_copies) - return dataset - - -def get_train_dataset(dataset_root: str, args: argparse.Namespace) -> torch.utils.data.Dataset: - datasets = [] - for dataset_name in args.train_datasets: - transform = make_train_transform(args) - dataset = make_dataset(dataset_name, dataset_root, transform) - datasets.append(dataset) - - if len(datasets) == 0: - raise ValueError("No datasets specified for training") - - samples_per_step = args.world_size * args.batch_size - - for idx, (dataset, steps_per_dataset) in enumerate(zip(datasets, args.dataset_steps)): - datasets[idx] = resize_dataset_to_n_steps(dataset, steps_per_dataset, samples_per_step, args) - - dataset = torch.utils.data.ConcatDataset(datasets) - if args.dataset_order_shuffle: - dataset = shuffle_dataset(dataset) - - print(f"Training dataset: {len(dataset)} samples") - return dataset - - -@torch.inference_mode() -def _evaluate( - model, - args, - val_loader, - *, - 
padder_mode, - print_freq=10, - writter=None, - step=None, - iterations=None, - batch_size=None, - header=None, -): - """Helper function to compute various metrics (epe, etc.) for a model on a given dataset.""" - model.eval() - header = header or "Test:" - device = torch.device(args.device) - metric_logger = utils.MetricLogger(delimiter=" ") - - iterations = iterations or args.recurrent_updates - - logger = utils.MetricLogger() - for meter_name in args.metrics: - logger.add_meter(meter_name, fmt="{global_avg:.4f}") - if "fl-all" not in args.metrics: - logger.add_meter("fl-all", fmt="{global_avg:.4f}") - - num_processed_samples = 0 - with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): - for blob in metric_logger.log_every(val_loader, print_freq, header): - image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) - padder = utils.InputPadder(image_left.shape, mode=padder_mode) - image_left, image_right = padder.pad(image_left, image_right) - - disp_predictions = model(image_left, image_right, flow_init=None, num_iters=iterations) - disp_pred = disp_predictions[-1][:, :1, :, :] - disp_pred = padder.unpad(disp_pred) - - metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) - num_processed_samples += image_left.shape[0] - for name in metrics: - logger.meters[name].update(metrics[name], n=1) - - num_processed_samples = utils.reduce_across_processes(num_processed_samples) - - print("Num_processed_samples: ", num_processed_samples) - if ( - hasattr(val_loader.dataset, "__len__") - and len(val_loader.dataset) != num_processed_samples - and torch.distributed.get_rank() == 0 - ): - warnings.warn( - f"Number of processed samples {num_processed_samples} is different" - f"from the dataset size {len(val_loader.dataset)}. This may happen if" - "the dataset is not divisible by the batch size. Try lowering the batch size or GPU number for more accurate results." 
- ) - - if writter is not None and args.rank == 0: - for meter_name, meter_value in logger.meters.items(): - scalar_name = f"{meter_name} {header}" - writter.add_scalar(scalar_name, meter_value.avg, step) - - logger.synchronize_between_processes() - print(header, logger) - - -def make_eval_loader(dataset_name: str, args: argparse.Namespace) -> torch.utils.data.DataLoader: - if args.weights: - weights = torchvision.models.get_weight(args.weights) - trans = weights.transforms() - - def preprocessing(image_left, image_right, disp, valid_disp_mask): - C_o, H_o, W_o = get_dimensions(image_left) - image_left, image_right = trans(image_left, image_right) - - C_t, H_t, W_t = get_dimensions(image_left) - scale_factor = W_t / W_o - - if disp is not None and not isinstance(disp, torch.Tensor): - disp = torch.from_numpy(disp) - if W_t != W_o: - disp = resize(disp, (H_t, W_t), mode=InterpolationMode.BILINEAR) * scale_factor - if valid_disp_mask is not None and not isinstance(valid_disp_mask, torch.Tensor): - valid_disp_mask = torch.from_numpy(valid_disp_mask) - if W_t != W_o: - valid_disp_mask = resize(valid_disp_mask, (H_t, W_t), mode=InterpolationMode.NEAREST) - return image_left, image_right, disp, valid_disp_mask - - else: - preprocessing = make_eval_transform(args) - - val_dataset = make_dataset(dataset_name, args.dataset_root, transforms=preprocessing) - if args.distributed: - sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=False) - else: - sampler = torch.utils.data.SequentialSampler(val_dataset) - - val_loader = torch.utils.data.DataLoader( - val_dataset, - sampler=sampler, - batch_size=args.batch_size, - pin_memory=True, - num_workers=args.workers, - ) - - return val_loader - - -def evaluate(model, loaders, args, writter=None, step=None): - for loader_name, loader in loaders.items(): - _evaluate( - model, - args, - loader, - iterations=args.recurrent_updates, - padder_mode=args.padder_type, - header=f"{loader_name} evaluation", - batch_size=args.batch_size, - writter=writter, - step=step, - ) - - -def run(model, optimizer, scheduler, train_loader, val_loaders, logger, writer, scaler, args): - device = torch.device(args.device) - # wrap the loader in a logger - loader = iter(logger.log_every(train_loader)) - # output channels - model_out_channels = model.module.output_channels if args.distributed else model.output_channels - - torch.set_num_threads(args.threads) - - sequence_criterion = utils.SequenceLoss( - gamma=args.gamma, - max_flow=args.max_disparity, - exclude_large_flows=args.flow_loss_exclude_large, - ).to(device) - - if args.consistency_weight: - consistency_criterion = utils.FlowSequenceConsistencyLoss( - args.gamma, - resize_factor=0.25, - rescale_factor=0.25, - rescale_mode="bilinear", - ).to(device) - else: - consistency_criterion = None - - if args.psnr_weight: - psnr_criterion = utils.PSNRLoss().to(device) - else: - psnr_criterion = None - - if args.smoothness_weight: - smoothness_criterion = utils.SmoothnessLoss().to(device) - else: - smoothness_criterion = None - - if args.photometric_weight: - photometric_criterion = utils.FlowPhotoMetricLoss( - ssim_weight=args.photometric_ssim_weight, - max_displacement_ratio=args.photometric_max_displacement_ratio, - ssim_use_padding=False, - ).to(device) - else: - photometric_criterion = None - - for step in range(args.start_step + 1, args.total_iterations + 1): - data_blob = next(loader) - optimizer.zero_grad() - - # unpack the data blob - image_left, image_right, disp_mask, valid_disp_mask 
= (x.to(device) for x in data_blob) - with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): - disp_predictions = model(image_left, image_right, flow_init=None, num_iters=args.recurrent_updates) - # different models have different outputs, make sure we get the right ones for this task - disp_predictions = make_stereo_flow(disp_predictions, model_out_channels) - # should the architecture or training loop require it, we have to adjust the disparity mask - # target to possibly look like an optical flow mask - disp_mask = make_stereo_flow(disp_mask, model_out_channels) - # sequence loss on top of the model outputs - - loss = sequence_criterion(disp_predictions, disp_mask, valid_disp_mask) * args.flow_loss_weight - - if args.consistency_weight > 0: - loss_consistency = consistency_criterion(disp_predictions) - loss += loss_consistency * args.consistency_weight - - if args.psnr_weight > 0: - loss_psnr = 0.0 - for pred in disp_predictions: - # predictions might have 2 channels - loss_psnr += psnr_criterion( - pred * valid_disp_mask.unsqueeze(1), - disp_mask * valid_disp_mask.unsqueeze(1), - ).mean() # mean the psnr loss over the batch - loss += loss_psnr / len(disp_predictions) * args.psnr_weight - - if args.photometric_weight > 0: - loss_photometric = 0.0 - for pred in disp_predictions: - # predictions might have 1 channel, therefore we need to inpute 0s for the second channel - if model_out_channels == 1: - pred = torch.cat([pred, torch.zeros_like(pred)], dim=1) - - loss_photometric += photometric_criterion( - image_left, image_right, pred, valid_disp_mask - ) # photometric loss already comes out meaned over the batch - loss += loss_photometric / len(disp_predictions) * args.photometric_weight - - if args.smoothness_weight > 0: - loss_smoothness = 0.0 - for pred in disp_predictions: - # predictions might have 2 channels - loss_smoothness += smoothness_criterion( - image_left, pred[:, :1, :, :] - ).mean() # mean the smoothness loss over the batch - loss += loss_smoothness / len(disp_predictions) * args.smoothness_weight - - with torch.no_grad(): - metrics, _ = utils.compute_metrics( - disp_predictions[-1][:, :1, :, :], # predictions might have 2 channels - disp_mask[:, :1, :, :], # so does the ground truth - valid_disp_mask, - args.metrics, - ) - - metrics.pop("fl-all", None) - logger.update(loss=loss, **metrics) - - if scaler is not None: - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - if args.clip_grad_norm: - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) - scaler.step(optimizer) - scaler.update() - else: - loss.backward() - if args.clip_grad_norm: - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) - optimizer.step() - - scheduler.step() - - if not dist.is_initialized() or dist.get_rank() == 0: - if writer is not None and step % args.tensorboard_log_frequency == 0: - # log the loss and metrics to tensorboard - - writer.add_scalar("loss", loss, step) - for name, value in logger.meters.items(): - writer.add_scalar(name, value.avg, step) - # log the images to tensorboard - pred_grid = vizualization.make_training_sample_grid( - image_left, image_right, disp_mask, valid_disp_mask, disp_predictions - ) - writer.add_image("predictions", pred_grid, step, dataformats="HWC") - - # second thing we want to see is how relevant the iterative refinement is - pred_sequence_grid = vizualization.make_disparity_sequence_grid(disp_predictions, disp_mask) - writer.add_image("sequence", pred_sequence_grid, 
step, dataformats="HWC") - - if step % args.save_frequency == 0: - if not args.distributed or args.rank == 0: - model_without_ddp = ( - model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model - ) - checkpoint = { - "model": model_without_ddp.state_dict(), - "optimizer": optimizer.state_dict(), - "scheduler": scheduler.state_dict(), - "step": step, - "args": args, - } - os.makedirs(args.checkpoint_dir, exist_ok=True) - torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") - torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") - - if step % args.valid_frequency == 0: - evaluate(model, val_loaders, args, writer, step) - model.train() - if args.freeze_batch_norm: - if isinstance(model, nn.parallel.DistributedDataParallel): - freeze_batch_norm(model.module) - else: - freeze_batch_norm(model) - - # one final save at the end - if not args.distributed or args.rank == 0: - model_without_ddp = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model - checkpoint = { - "model": model_without_ddp.state_dict(), - "optimizer": optimizer.state_dict(), - "scheduler": scheduler.state_dict(), - "step": step, - "args": args, - } - os.makedirs(args.checkpoint_dir, exist_ok=True) - torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") - torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") - - -def main(args): - args.total_iterations = sum(args.dataset_steps) - - # intialize DDP setting - utils.setup_ddp(args) - print(args) - - args.test_only = args.train_datasets is None - - # set the appropiate devices - if args.distributed and args.device == "cpu": - raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") - device = torch.device(args.device) - - # select model architecture - model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) - - # convert to DDP if need be - if args.distributed: - model = model.to(args.gpu) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) - model_without_ddp = model.module - else: - model.to(device) - model_without_ddp = model - - os.makedirs(args.checkpoint_dir, exist_ok=True) - - val_loaders = {name: make_eval_loader(name, args) for name in args.test_datasets} - - # EVAL ONLY configurations - if args.test_only: - evaluate(model, val_loaders, args) - return - - # Sanity check for the parameter count - print(f"Parameter Count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}") - - # Compose the training dataset - train_dataset = get_train_dataset(args.dataset_root, args) - - # initialize the optimizer - if args.optimizer == "adam": - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) - elif args.optimizer == "sgd": - optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, momentum=0.9) - else: - raise ValueError(f"Unknown optimizer {args.optimizer}. 
Please choose between adam and sgd") - - # initialize the learning rate schedule - scheduler = make_lr_schedule(args, optimizer) - - # load them from checkpoint if need - args.start_step = 0 - if args.resume_path is not None: - checkpoint = torch.load(args.resume_path, map_location="cpu") - if "model" in checkpoint: - # this means the user requested to resume from a training checkpoint - model_without_ddp.load_state_dict(checkpoint["model"]) - # this means the user wants to continue training from where it was left off - if args.resume_schedule: - optimizer.load_state_dict(checkpoint["optimizer"]) - scheduler.load_state_dict(checkpoint["scheduler"]) - args.start_step = checkpoint["step"] + 1 - # modify starting point of the dat - sample_start_step = args.start_step * args.batch_size * args.world_size - train_dataset = train_dataset[sample_start_step:] - - else: - # this means the user wants to finetune on top of a model state dict - # and that no other changes are required - model_without_ddp.load_state_dict(checkpoint) - - torch.backends.cudnn.benchmark = True - - # enable training mode - model.train() - if args.freeze_batch_norm: - freeze_batch_norm(model_without_ddp) - - # put dataloader on top of the dataset - # make sure to disable shuffling since the dataset is already shuffled - # in order to guarantee quasi randomness whilst retaining a deterministic - # dataset consumption order - if args.distributed: - # the train dataset is preshuffled in order to respect the iteration order - sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=False, drop_last=True) - else: - # the train dataset is already shuffled so we can use a simple SequentialSampler - sampler = torch.utils.data.SequentialSampler(train_dataset) - - train_loader = torch.utils.data.DataLoader( - train_dataset, - sampler=sampler, - batch_size=args.batch_size, - pin_memory=True, - num_workers=args.workers, - ) - - # intialize the logger - if args.tensorboard_summaries: - from torch.utils.tensorboard import SummaryWriter - - tensorboard_path = Path(args.checkpoint_dir) / "tensorboard" - os.makedirs(tensorboard_path, exist_ok=True) - - tensorboard_run = tensorboard_path / f"{args.name}" - writer = SummaryWriter(tensorboard_run) - else: - writer = None - - logger = utils.MetricLogger(delimiter=" ") - - scaler = torch.cuda.amp.GradScaler() if args.mixed_precision else None - # run the training loop - # this will perform optimization, respectively logging and saving checkpoints - # when need be - run( - model=model, - optimizer=optimizer, - scheduler=scheduler, - train_loader=train_loader, - val_loaders=val_loaders, - logger=logger, - writer=writer, - scaler=scaler, - args=args, - ) - - -def get_args_parser(add_help=True): - import argparse - - parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Training", add_help=add_help) - # checkpointing - parser.add_argument("--name", default="crestereo", help="name of the experiment") - parser.add_argument("--resume", type=str, default=None, help="from which checkpoint to resume") - parser.add_argument("--checkpoint-dir", type=str, default="checkpoints", help="path to the checkpoint directory") - - # dataset - parser.add_argument("--dataset-root", type=str, default="", help="path to the dataset root directory") - parser.add_argument( - "--train-datasets", - type=str, - nargs="+", - default=["crestereo"], - help="dataset(s) to train on", - choices=list(VALID_DATASETS.keys()), - ) - parser.add_argument( - "--dataset-steps", type=int, nargs="+", 
default=[300_000], help="number of steps for each dataset" - ) - parser.add_argument( - "--steps-is-epochs", action="store_true", help="if set, dataset-steps are interpreted as epochs" - ) - parser.add_argument( - "--test-datasets", - type=str, - nargs="+", - default=["middlebury2014-train"], - help="dataset(s) to test on", - choices=["middlebury2014-train"], - ) - parser.add_argument("--dataset-shuffle", type=bool, help="shuffle the dataset", default=True) - parser.add_argument("--dataset-order-shuffle", type=bool, help="shuffle the dataset order", default=True) - parser.add_argument("--batch-size", type=int, default=2, help="batch size per GPU") - parser.add_argument("--workers", type=int, default=4, help="number of workers per GPU") - parser.add_argument( - "--threads", - type=int, - default=16, - help="number of CPU threads per GPU. This can be changed around to speed-up transforms if needed. This can lead to worker thread contention so use with care.", - ) - - # model architecture - parser.add_argument( - "--model", - type=str, - default="crestereo_base", - help="model architecture", - choices=["crestereo_base", "raft_stereo"], - ) - parser.add_argument("--recurrent-updates", type=int, default=10, help="number of recurrent updates") - parser.add_argument("--freeze-batch-norm", action="store_true", help="freeze batch norm parameters") - - # loss parameters - parser.add_argument("--gamma", type=float, default=0.8, help="gamma parameter for the flow sequence loss") - parser.add_argument("--flow-loss-weight", type=float, default=1.0, help="weight for the flow loss") - parser.add_argument( - "--flow-loss-exclude-large", - action="store_true", - help="exclude large flow values from the loss. A large value is defined as a value greater than the ground truth flow norm", - default=False, - ) - parser.add_argument("--consistency-weight", type=float, default=0.0, help="consistency loss weight") - parser.add_argument( - "--consistency-resize-factor", - type=float, - default=0.25, - help="consistency loss resize factor to account for the fact that the flow is computed on a downsampled image", - ) - parser.add_argument("--psnr-weight", type=float, default=0.0, help="psnr loss weight") - parser.add_argument("--smoothness-weight", type=float, default=0.0, help="smoothness loss weight") - parser.add_argument("--photometric-weight", type=float, default=0.0, help="photometric loss weight") - parser.add_argument( - "--photometric-max-displacement-ratio", - type=float, - default=0.15, - help="Only pixels with a displacement smaller than this ratio of the image width will be considered for the photometric loss", - ) - parser.add_argument("--photometric-ssim-weight", type=float, default=0.85, help="photometric ssim loss weight") - - # transforms parameters - parser.add_argument("--gpu-transforms", action="store_true", help="use GPU transforms") - parser.add_argument( - "--eval-size", type=int, nargs="+", default=[384, 512], help="size of the images for evaluation" - ) - parser.add_argument("--resize-size", type=int, nargs=2, default=None, help="resize size") - parser.add_argument("--crop-size", type=int, nargs=2, default=[384, 512], help="crop size") - parser.add_argument("--scale-range", type=float, nargs=2, default=[0.6, 1.0], help="random scale range") - parser.add_argument("--rescale-prob", type=float, default=1.0, help="probability of resizing the image") - parser.add_argument( - "--scaling-type", type=str, default="linear", help="scaling type", choices=["exponential", "linear"] - ) - 
parser.add_argument("--flip-prob", type=float, default=0.5, help="probability of flipping the image") - parser.add_argument( - "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization" - ) - parser.add_argument( - "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization" - ) - parser.add_argument( - "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False - ) - parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity") - parser.add_argument( - "--interpolation-strategy", - type=str, - default="bilinear", - help="interpolation strategy", - choices=["bilinear", "bicubic", "mixed"], - ) - parser.add_argument("--spatial-shift-prob", type=float, default=1.0, help="probability of shifting the image") - parser.add_argument( - "--spatial-shift-max-angle", type=float, default=0.1, help="maximum angle for the spatial shift" - ) - parser.add_argument( - "--spatial-shift-max-displacement", type=float, default=2.0, help="maximum displacement for the spatial shift" - ) - parser.add_argument("--gamma-range", type=float, nargs="+", default=[0.8, 1.2], help="range for gamma correction") - parser.add_argument( - "--brightness-range", type=float, nargs="+", default=[0.8, 1.2], help="range for brightness correction" - ) - parser.add_argument( - "--contrast-range", type=float, nargs="+", default=[0.8, 1.2], help="range for contrast correction" - ) - parser.add_argument( - "--saturation-range", type=float, nargs="+", default=0.0, help="range for saturation correction" - ) - parser.add_argument("--hue-range", type=float, nargs="+", default=0.0, help="range for hue correction") - parser.add_argument( - "--asymmetric-jitter-prob", - type=float, - default=1.0, - help="probability of using asymmetric jitter instead of symmetric jitter", - ) - parser.add_argument("--occlusion-prob", type=float, default=0.5, help="probability of occluding the rightimage") - parser.add_argument( - "--occlusion-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of occluded pixels" - ) - parser.add_argument("--erase-prob", type=float, default=0.0, help="probability of erasing in both images") - parser.add_argument( - "--erase-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of erased pixels" - ) - parser.add_argument( - "--erase-num-repeats", type=int, default=1, help="number of times to repeat the erase operation" - ) - - # optimizer parameters - parser.add_argument("--optimizer", type=str, default="adam", help="optimizer", choices=["adam", "sgd"]) - parser.add_argument("--lr", type=float, default=4e-4, help="learning rate") - parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay") - parser.add_argument("--clip-grad-norm", type=float, default=0.0, help="clip grad norm") - - # lr_scheduler parameters - parser.add_argument("--min-lr", type=float, default=2e-5, help="minimum learning rate") - parser.add_argument("--warmup-steps", type=int, default=6_000, help="number of warmup steps") - parser.add_argument( - "--decay-after-steps", type=int, default=180_000, help="number of steps after which to start decay the lr" - ) - parser.add_argument( - "--lr-warmup-method", type=str, default="linear", help="warmup method", choices=["linear", "cosine"] - ) - parser.add_argument("--lr-warmup-factor", type=float, default=0.02, help="warmup factor for the learning rate") - parser.add_argument( - "--lr-decay-method", - 
type=str, - default="linear", - help="decay method", - choices=["linear", "cosine", "exponential"], - ) - parser.add_argument("--lr-decay-gamma", type=float, default=0.8, help="decay factor for the learning rate") - - # deterministic behaviour - parser.add_argument("--seed", type=int, default=42, help="seed for random number generators") - - # mixed precision training - parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training") - - # logging - parser.add_argument("--tensorboard-summaries", action="store_true", help="log to tensorboard") - parser.add_argument("--tensorboard-log-frequency", type=int, default=100, help="log frequency") - parser.add_argument("--save-frequency", type=int, default=1_000, help="save frequency") - parser.add_argument("--valid-frequency", type=int, default=1_000, help="validation frequency") - parser.add_argument( - "--metrics", - type=str, - nargs="+", - default=["mae", "rmse", "1px", "3px", "5px", "relepe"], - help="metrics to log", - choices=AVAILABLE_METRICS, - ) - - # distributed parameters - parser.add_argument("--world-size", type=int, default=8, help="number of distributed processes") - parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training") - parser.add_argument("--device", type=str, default="cuda", help="device to use for training") - - # weights API - parser.add_argument("--weights", type=str, default=None, help="weights API url") - parser.add_argument( - "--resume-path", type=str, default=None, help="a path from which to resume or start fine-tuning" - ) - parser.add_argument("--resume-schedule", action="store_true", help="resume optimizer state") - - # padder parameters - parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"]) - return parser - - -if __name__ == "__main__": - args = get_args_parser().parse_args() - main(args) diff --git a/references/depth/stereo/transforms.py b/references/depth/stereo/transforms.py deleted file mode 100644 index 10c5be68737..00000000000 --- a/references/depth/stereo/transforms.py +++ /dev/null @@ -1,646 +0,0 @@ -import random -from typing import Callable, List, Optional, Sequence, Tuple, Union - -import numpy as np -import PIL.Image -import torch -import torchvision.transforms as T -import torchvision.transforms.functional as F -from torch import Tensor - -T_FLOW = Union[Tensor, np.ndarray, None] -T_MASK = Union[Tensor, np.ndarray, None] -T_STEREO_TENSOR = Tuple[Tensor, Tensor] -T_COLOR_AUG_PARAM = Union[float, Tuple[float, float]] - - -def rand_float_range(size: Sequence[int], low: float, high: float) -> Tensor: - return (low - high) * torch.rand(size) + high - - -class InterpolationStrategy: - - _valid_modes: List[str] = ["mixed", "bicubic", "bilinear"] - - def __init__(self, mode: str = "mixed") -> None: - if mode not in self._valid_modes: - raise ValueError(f"Invalid interpolation mode: {mode}. 
Valid modes are: {self._valid_modes}") - - if mode == "mixed": - self.strategies = [F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC] - elif mode == "bicubic": - self.strategies = [F.InterpolationMode.BICUBIC] - elif mode == "bilinear": - self.strategies = [F.InterpolationMode.BILINEAR] - - def __call__(self) -> F.InterpolationMode: - return random.choice(self.strategies) - - @classmethod - def is_valid(mode: str) -> bool: - return mode in InterpolationStrategy._valid_modes - - @property - def valid_modes() -> List[str]: - return InterpolationStrategy._valid_modes - - -class ValidateModelInput(torch.nn.Module): - # Pass-through transform that checks the shape and dtypes to make sure the model gets what it expects - def forward(self, images: T_STEREO_TENSOR, disparities: T_FLOW, masks: T_MASK): - if images[0].shape != images[1].shape: - raise ValueError("img1 and img2 should have the same shape.") - h, w = images[0].shape[-2:] - if disparities[0] is not None and disparities[0].shape != (1, h, w): - raise ValueError(f"disparities[0].shape should be (1, {h}, {w}) instead of {disparities[0].shape}") - if masks[0] is not None: - if masks[0].shape != (h, w): - raise ValueError(f"masks[0].shape should be ({h}, {w}) instead of {masks[0].shape}") - if masks[0].dtype != torch.bool: - raise TypeError(f"masks[0] should be of dtype torch.bool instead of {masks[0].dtype}") - - return images, disparities, masks - - -class ConvertToGrayscale(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - - def forward( - self, - images: Tuple[PIL.Image.Image, PIL.Image.Image], - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - img_left = F.rgb_to_grayscale(images[0], num_output_channels=3) - img_right = F.rgb_to_grayscale(images[1], num_output_channels=3) - - return (img_left, img_right), disparities, masks - - -class MakeValidDisparityMask(torch.nn.Module): - def __init__(self, max_disparity: Optional[int] = 256) -> None: - super().__init__() - self.max_disparity = max_disparity - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - valid_masks = tuple( - torch.ones(images[idx].shape[-2:], dtype=torch.bool, device=images[idx].device) if mask is None else mask - for idx, mask in enumerate(masks) - ) - - valid_masks = tuple( - torch.logical_and(mask, disparity > 0).squeeze(0) if disparity is not None else mask - for mask, disparity in zip(valid_masks, disparities) - ) - - if self.max_disparity is not None: - valid_masks = tuple( - torch.logical_and(mask, disparity < self.max_disparity).squeeze(0) if disparity is not None else mask - for mask, disparity in zip(valid_masks, disparities) - ) - - return images, disparities, valid_masks - - -class ToGPU(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - dev_images = tuple(image.cuda() for image in images) - dev_disparities = tuple(map(lambda x: x.cuda() if x is not None else None, disparities)) - dev_masks = tuple(map(lambda x: x.cuda() if x is not None else None, masks)) - return dev_images, dev_disparities, dev_masks - - -class ConvertImageDtype(torch.nn.Module): - def __init__(self, 
dtype: torch.dtype): - super().__init__() - self.dtype = dtype - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - img_left = F.convert_image_dtype(images[0], dtype=self.dtype) - img_right = F.convert_image_dtype(images[1], dtype=self.dtype) - - img_left = img_left.contiguous() - img_right = img_right.contiguous() - - return (img_left, img_right), disparities, masks - - -class Normalize(torch.nn.Module): - def __init__(self, mean: List[float], std: List[float]) -> None: - super().__init__() - self.mean = mean - self.std = std - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - - img_left = F.normalize(images[0], mean=self.mean, std=self.std) - img_right = F.normalize(images[1], mean=self.mean, std=self.std) - - img_left = img_left.contiguous() - img_right = img_right.contiguous() - - return (img_left, img_right), disparities, masks - - -class ToTensor(torch.nn.Module): - def forward( - self, - images: Tuple[PIL.Image.Image, PIL.Image.Image], - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - if images[0] is None: - raise ValueError("img_left is None") - if images[1] is None: - raise ValueError("img_right is None") - - img_left = F.pil_to_tensor(images[0]) - img_right = F.pil_to_tensor(images[1]) - disparity_tensors = () - mask_tensors = () - - for idx in range(2): - disparity_tensors += (torch.from_numpy(disparities[idx]),) if disparities[idx] is not None else (None,) - mask_tensors += (torch.from_numpy(masks[idx]),) if masks[idx] is not None else (None,) - - return (img_left, img_right), disparity_tensors, mask_tensors - - -class AsymmetricColorJitter(T.ColorJitter): - # p determines the probability of doing asymmetric vs symmetric color jittering - def __init__( - self, - brightness: T_COLOR_AUG_PARAM = 0, - contrast: T_COLOR_AUG_PARAM = 0, - saturation: T_COLOR_AUG_PARAM = 0, - hue: T_COLOR_AUG_PARAM = 0, - p: float = 0.2, - ): - super().__init__(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) - self.p = p - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - - if torch.rand(1) < self.p: - # asymmetric: different transform for img1 and img2 - img_left = super().forward(images[0]) - img_right = super().forward(images[1]) - else: - # symmetric: same transform for img1 and img2 - batch = torch.stack(images) - batch = super().forward(batch) - img_left, img_right = batch[0], batch[1] - - return (img_left, img_right), disparities, masks - - -class AsymetricGammaAdjust(torch.nn.Module): - def __init__(self, p: float, gamma_range: Tuple[float, float], gain: float = 1) -> None: - super().__init__() - self.gamma_range = gamma_range - self.gain = gain - self.p = p - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - - gamma = rand_float_range((1,), low=self.gamma_range[0], high=self.gamma_range[1]).item() - - if torch.rand(1) < self.p: - # asymmetric: different transform for img1 and img2 - 
img_left = F.adjust_gamma(images[0], gamma, gain=self.gain) - img_right = F.adjust_gamma(images[1], gamma, gain=self.gain) - else: - # symmetric: same transform for img1 and img2 - batch = torch.stack(images) - batch = F.adjust_gamma(batch, gamma, gain=self.gain) - img_left, img_right = batch[0], batch[1] - - return (img_left, img_right), disparities, masks - - -class RandomErase(torch.nn.Module): - # Produces multiple symetric random erasures - # these can be viewed as occlusions present in both camera views. - # Similarly to Optical Flow occlusion prediction tasks, we mask these pixels in the disparity map - def __init__( - self, - p: float = 0.5, - erase_px_range: Tuple[int, int] = (50, 100), - value: Union[Tensor, float] = 0, - inplace: bool = False, - max_erase: int = 2, - ): - super().__init__() - self.min_px_erase = erase_px_range[0] - self.max_px_erase = erase_px_range[1] - if self.max_px_erase < 0: - raise ValueError("erase_px_range[1] should be equal or greater than 0") - if self.min_px_erase < 0: - raise ValueError("erase_px_range[0] should be equal or greater than 0") - if self.min_px_erase > self.max_px_erase: - raise ValueError("erase_prx_range[0] should be equal or lower than erase_px_range[1]") - - self.p = p - self.value = value - self.inplace = inplace - self.max_erase = max_erase - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: T_STEREO_TENSOR, - masks: T_STEREO_TENSOR, - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - - if torch.rand(1) < self.p: - return images, disparities, masks - - image_left, image_right = images - mask_left, mask_right = masks - for _ in range(torch.randint(self.max_erase, size=(1,)).item()): - y, x, h, w, v = self._get_params(image_left) - image_right = F.erase(image_right, y, x, h, w, v, self.inplace) - image_left = F.erase(image_left, y, x, h, w, v, self.inplace) - # similarly to optical flow occlusion prediction, we consider - # any erasure pixels that are in both images to be occluded therefore - # we mark them as invalid - if mask_left is not None: - mask_left = F.erase(mask_left, y, x, h, w, False, self.inplace) - if mask_right is not None: - mask_right = F.erase(mask_right, y, x, h, w, False, self.inplace) - - return (image_left, image_right), disparities, (mask_left, mask_right) - - def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: - img_h, img_w = img.shape[-2:] - crop_h, crop_w = ( - random.randint(self.min_px_erase, self.max_px_erase), - random.randint(self.min_px_erase, self.max_px_erase), - ) - crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) - - return crop_y, crop_x, crop_h, crop_w, self.value - - -class RandomOcclusion(torch.nn.Module): - # This adds an occlusion in the right image - # the occluded patch works as a patch erase where the erase value is the mean - # of the pixels from the selected zone - def __init__(self, p: float = 0.5, occlusion_px_range: Tuple[int, int] = (50, 100), inplace: bool = False): - super().__init__() - - self.min_px_occlusion = occlusion_px_range[0] - self.max_px_occlusion = occlusion_px_range[1] - - if self.max_px_occlusion < 0: - raise ValueError("occlusion_px_range[1] should be greater or equal than 0") - if self.min_px_occlusion < 0: - raise ValueError("occlusion_px_range[0] should be greater or equal than 0") - if self.min_px_occlusion > self.max_px_occlusion: - raise ValueError("occlusion_px_range[0] should be lower than occlusion_px_range[1]") - - self.p = p - self.inplace = inplace 
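RandomOcclusion fills a randomly sized rectangle of the right image with the mean colour of that rectangle, mimicking an occluder that only one camera sees. A minimal standalone sketch of that fill step (the helper name and the (C, H, W) float-tensor assumption are illustrative only, not part of the reference code):

import torch

def fill_patch_with_mean(img: torch.Tensor, y: int, x: int, h: int, w: int) -> torch.Tensor:
    # img is a (C, H, W) float tensor; the selected patch is replaced by its
    # per-channel mean, so the occluded area keeps a plausible colour but no texture.
    out = img.clone()
    patch = img[..., y : y + h, x : x + w]
    out[..., y : y + h, x : x + w] = patch.mean(dim=(-2, -1), keepdim=True)
    return out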
- - def forward( - self, - images: T_STEREO_TENSOR, - disparities: T_STEREO_TENSOR, - masks: T_STEREO_TENSOR, - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - - left_image, right_image = images - - if torch.rand(1) < self.p: - return images, disparities, masks - - y, x, h, w, v = self._get_params(right_image) - right_image = F.erase(right_image, y, x, h, w, v, self.inplace) - - return ((left_image, right_image), disparities, masks) - - def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: - img_h, img_w = img.shape[-2:] - crop_h, crop_w = ( - random.randint(self.min_px_occlusion, self.max_px_occlusion), - random.randint(self.min_px_occlusion, self.max_px_occlusion), - ) - - crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) - occlusion_value = img[..., crop_y : crop_y + crop_h, crop_x : crop_x + crop_w].mean(dim=(-2, -1), keepdim=True) - - return (crop_y, crop_x, crop_h, crop_w, occlusion_value) - - -class RandomSpatialShift(torch.nn.Module): - # This transform applies a vertical shift and a slight angle rotation and the same time - def __init__( - self, p: float = 0.5, max_angle: float = 0.1, max_px_shift: int = 2, interpolation_type: str = "bilinear" - ) -> None: - super().__init__() - self.p = p - self.max_angle = max_angle - self.max_px_shift = max_px_shift - self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: T_STEREO_TENSOR, - masks: T_STEREO_TENSOR, - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - # the transform is applied only on the right image - # in order to mimic slight calibration issues - img_left, img_right = images - - INTERP_MODE = self._interpolation_mode_strategy() - - if torch.rand(1) < self.p: - # [0, 1] -> [-a, a] - shift = rand_float_range((1,), low=-self.max_px_shift, high=self.max_px_shift).item() - angle = rand_float_range((1,), low=-self.max_angle, high=self.max_angle).item() - # sample center point for the rotation matrix - y = torch.randint(size=(1,), low=0, high=img_right.shape[-2]).item() - x = torch.randint(size=(1,), low=0, high=img_right.shape[-1]).item() - # apply affine transformations - img_right = F.affine( - img_right, - angle=angle, - translate=[0, shift], # translation only on the y axis - center=[x, y], - scale=1.0, - shear=0.0, - interpolation=INTERP_MODE, - ) - - return ((img_left, img_right), disparities, masks) - - -class RandomHorizontalFlip(torch.nn.Module): - def __init__(self, p: float = 0.5) -> None: - super().__init__() - self.p = p - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - - img_left, img_right = images - dsp_left, dsp_right = disparities - mask_left, mask_right = masks - - if dsp_right is not None and torch.rand(1) < self.p: - img_left, img_right = F.hflip(img_left), F.hflip(img_right) - dsp_left, dsp_right = F.hflip(dsp_left), F.hflip(dsp_right) - if mask_left is not None and mask_right is not None: - mask_left, mask_right = F.hflip(mask_left), F.hflip(mask_right) - return ((img_right, img_left), (dsp_right, dsp_left), (mask_right, mask_left)) - - return images, disparities, masks - - -class Resize(torch.nn.Module): - def __init__(self, resize_size: Tuple[int, ...], interpolation_type: str = "bilinear") -> None: - super().__init__() - self.resize_size = 
list(resize_size) # doing this to keep mypy happy - self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - resized_images = () - resized_disparities = () - resized_masks = () - - INTERP_MODE = self._interpolation_mode_strategy() - - for img in images: - resized_images += (F.resize(img, self.resize_size, interpolation=INTERP_MODE),) - - for dsp in disparities: - if dsp is not None: - # rescale disparity to match the new image size - scale_x = self.resize_size[1] / dsp.shape[-1] - resized_disparities += (F.resize(dsp, self.resize_size, interpolation=INTERP_MODE) * scale_x,) - else: - resized_disparities += (None,) - - for mask in masks: - if mask is not None: - resized_masks += ( - # we squeeze and unsqueeze because the API requires > 3D tensors - F.resize( - mask.unsqueeze(0), - self.resize_size, - interpolation=F.InterpolationMode.NEAREST, - ).squeeze(0), - ) - else: - resized_masks += (None,) - - return resized_images, resized_disparities, resized_masks - - -class RandomRescaleAndCrop(torch.nn.Module): - # This transform will resize the input with a given proba, and then crop it. - # These are the reversed operations of the built-in RandomResizedCrop, - # although the order of the operations doesn't matter too much: resizing a - # crop would give the same result as cropping a resized image, up to - # interpolation artifact at the borders of the output. - # - # The reason we don't rely on RandomResizedCrop is because of a significant - # difference in the parametrization of both transforms, in particular, - # because of the way the random parameters are sampled in both transforms, - # which leads to fairly different resuts (and different epe). For more details see - # https://github.com/pytorch/vision/pull/5026/files#r762932579 - def __init__( - self, - crop_size: Tuple[int, int], - scale_range: Tuple[float, float] = (-0.2, 0.5), - rescale_prob: float = 0.8, - scaling_type: str = "exponential", - interpolation_type: str = "bilinear", - ) -> None: - super().__init__() - self.crop_size = crop_size - self.min_scale = scale_range[0] - self.max_scale = scale_range[1] - self.rescale_prob = rescale_prob - self.scaling_type = scaling_type - self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) - - if self.scaling_type == "linear" and self.min_scale < 0: - raise ValueError("min_scale must be >= 0 for linear scaling") - - def forward( - self, - images: T_STEREO_TENSOR, - disparities: Tuple[T_FLOW, T_FLOW], - masks: Tuple[T_MASK, T_MASK], - ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: - - img_left, img_right = images - dsp_left, dsp_right = disparities - mask_left, mask_right = masks - INTERP_MODE = self._interpolation_mode_strategy() - - # randomly sample scale - h, w = img_left.shape[-2:] - # Note: in original code, they use + 1 instead of + 8 for sparse datasets (e.g. Kitti) - # It shouldn't matter much - min_scale = max((self.crop_size[0] + 8) / h, (self.crop_size[1] + 8) / w) - - # exponential scaling will draw a random scale in (min_scale, max_scale) and then raise - # 2 to the power of that random value. This final scale distribution will have a different - # mean and variance than a uniform distribution. 
Note that a scale of 1 will result in - # in a rescaling of 2X the original size, whereas a scale of -1 will result in a rescaling - # of 0.5X the original size. - if self.scaling_type == "exponential": - scale = 2 ** torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() - # linear scaling will draw a random scale in (min_scale, max_scale) - elif self.scaling_type == "linear": - scale = torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() - - scale = max(scale, min_scale) - - new_h, new_w = round(h * scale), round(w * scale) - - if torch.rand(1).item() < self.rescale_prob: - # rescale the images - img_left = F.resize(img_left, size=(new_h, new_w), interpolation=INTERP_MODE) - img_right = F.resize(img_right, size=(new_h, new_w), interpolation=INTERP_MODE) - - resized_masks, resized_disparities = (), () - - for disparity, mask in zip(disparities, masks): - if disparity is not None: - if mask is None: - resized_disparity = F.resize(disparity, size=(new_h, new_w), interpolation=INTERP_MODE) - # rescale the disparity - resized_disparity = ( - resized_disparity * torch.tensor([scale], device=resized_disparity.device)[:, None, None] - ) - resized_mask = None - else: - resized_disparity, resized_mask = _resize_sparse_flow( - disparity, mask, scale_x=scale, scale_y=scale - ) - resized_masks += (resized_mask,) - resized_disparities += (resized_disparity,) - - else: - resized_disparities = disparities - resized_masks = masks - - disparities = resized_disparities - masks = resized_masks - - # Note: For sparse datasets (Kitti), the original code uses a "margin" - # See e.g. https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 - # We don't, not sure it matters much - y0 = torch.randint(0, img_left.shape[1] - self.crop_size[0], size=(1,)).item() - x0 = torch.randint(0, img_right.shape[2] - self.crop_size[1], size=(1,)).item() - - img_left = F.crop(img_left, y0, x0, self.crop_size[0], self.crop_size[1]) - img_right = F.crop(img_right, y0, x0, self.crop_size[0], self.crop_size[1]) - if dsp_left is not None: - dsp_left = F.crop(disparities[0], y0, x0, self.crop_size[0], self.crop_size[1]) - if dsp_right is not None: - dsp_right = F.crop(disparities[1], y0, x0, self.crop_size[0], self.crop_size[1]) - - cropped_masks = () - for mask in masks: - if mask is not None: - mask = F.crop(mask, y0, x0, self.crop_size[0], self.crop_size[1]) - cropped_masks += (mask,) - - return ((img_left, img_right), (dsp_left, dsp_right), cropped_masks) - - -def _resize_sparse_flow( - flow: Tensor, valid_flow_mask: Tensor, scale_x: float = 1.0, scale_y: float = 0.0 -) -> Tuple[Tensor, Tensor]: - # This resizes both the flow and the valid_flow_mask mask (which is assumed to be reasonably sparse) - # There are as-many non-zero values in the original flow as in the resized flow (up to OOB) - # So for example if scale_x = scale_y = 2, the sparsity of the output flow is multiplied by 4 - - h, w = flow.shape[-2:] - - h_new = int(round(h * scale_y)) - w_new = int(round(w * scale_x)) - flow_new = torch.zeros(size=[1, h_new, w_new], dtype=flow.dtype) - valid_new = torch.zeros(size=[h_new, w_new], dtype=valid_flow_mask.dtype) - - jj, ii = torch.meshgrid(torch.arange(w), torch.arange(h), indexing="xy") - - ii_valid, jj_valid = ii[valid_flow_mask], jj[valid_flow_mask] - - ii_valid_new = torch.round(ii_valid.to(float) * scale_y).to(torch.long) - jj_valid_new = torch.round(jj_valid.to(float) * scale_x).to(torch.long) - - within_bounds_mask = (0 <= 
ii_valid_new) & (ii_valid_new < h_new) & (0 <= jj_valid_new) & (jj_valid_new < w_new) - - ii_valid = ii_valid[within_bounds_mask] - jj_valid = jj_valid[within_bounds_mask] - ii_valid_new = ii_valid_new[within_bounds_mask] - jj_valid_new = jj_valid_new[within_bounds_mask] - - valid_flow_new = flow[:, ii_valid, jj_valid] - valid_flow_new *= scale_x - - flow_new[:, ii_valid_new, jj_valid_new] = valid_flow_new - valid_new[ii_valid_new, jj_valid_new] = valid_flow_mask[ii_valid, jj_valid] - - return flow_new, valid_new.bool() - - -class Compose(torch.nn.Module): - def __init__(self, transforms: List[Callable]): - super().__init__() - self.transforms = transforms - - @torch.inference_mode() - def forward(self, images, disparities, masks): - for t in self.transforms: - images, disparities, masks = t(images, disparities, masks) - return images, disparities, masks diff --git a/references/depth/stereo/utils/__init__.py b/references/depth/stereo/utils/__init__.py deleted file mode 100644 index 4dacbe61ba0..00000000000 --- a/references/depth/stereo/utils/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .losses import * -from .metrics import * -from .distributed import * -from .logger import * -from .padder import * -from .norm import * diff --git a/references/depth/stereo/utils/distributed.py b/references/depth/stereo/utils/distributed.py deleted file mode 100644 index 228aa2a0f9a..00000000000 --- a/references/depth/stereo/utils/distributed.py +++ /dev/null @@ -1,60 +0,0 @@ -import os - -import torch -import torch.distributed as dist - - -def _redefine_print(is_main): - """disables printing when not in main process""" - import builtins as __builtin__ - - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop("force", False) - if is_main or force: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def setup_ddp(args): - # Set the local_rank, rank, and world_size values as args fields - # This is done differently depending on how we're running the script. 
We - # currently support either torchrun or the custom run_with_submitit.py - # If you're confused (like I was), this might help a bit - # https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2 - - if "RANK" in os.environ and "WORLD_SIZE" in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ["WORLD_SIZE"]) - args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() - elif hasattr(args, "rank"): - pass - else: - print("Not using distributed mode") - args.distributed = False - args.world_size = 1 - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - dist.init_process_group( - backend="nccl", - rank=args.rank, - world_size=args.world_size, - init_method=args.dist_url, - ) - torch.distributed.barrier() - _redefine_print(is_main=(args.rank == 0)) - - -def reduce_across_processes(val): - t = torch.tensor(val, device="cuda") - dist.barrier() - dist.all_reduce(t) - return t diff --git a/references/depth/stereo/utils/logger.py b/references/depth/stereo/utils/logger.py deleted file mode 100644 index 803e9aebd7b..00000000000 --- a/references/depth/stereo/utils/logger.py +++ /dev/null @@ -1,153 +0,0 @@ -import datetime -import time -from collections import defaultdict, deque - -import torch - -from .distributed import reduce_across_processes - - -class SmoothedValue: - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt="{median:.4f} ({global_avg:.4f})"): - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - self.fmt = fmt - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - def synchronize_between_processes(self): - """ - Warning: does not synchronize the deque! 
- """ - t = reduce_across_processes([self.count, self.total]) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque), dtype=torch.float32) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - @property - def max(self): - return max(self.deque) - - @property - def value(self): - return self.deque[-1] - - def __str__(self): - return self.fmt.format( - median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value - ) - - -class MetricLogger: - def __init__(self, delimiter="\t"): - self.meters = defaultdict(SmoothedValue) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in kwargs.items(): - if isinstance(v, torch.Tensor): - v = v.item() - if not isinstance(v, (float, int)): - raise TypeError( - f"This method expects the value of the input arguments to be of type float or int, instead got {type(v)}" - ) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append(f"{name}: {str(meter)}") - return self.delimiter.join(loss_str) - - def synchronize_between_processes(self): - for meter in self.meters.values(): - meter.synchronize_between_processes() - - def add_meter(self, name, **kwargs): - self.meters[name] = SmoothedValue(**kwargs) - - def log_every(self, iterable, print_freq=5, header=None): - i = 0 - if not header: - header = "" - start_time = time.time() - end = time.time() - iter_time = SmoothedValue(fmt="{avg:.4f}") - data_time = SmoothedValue(fmt="{avg:.4f}") - space_fmt = ":" + str(len(str(len(iterable)))) + "d" - if torch.cuda.is_available(): - log_msg = self.delimiter.join( - [ - header, - "[{0" + space_fmt + "}/{1}]", - "eta: {eta}", - "{meters}", - "time: {time}", - "data: {data}", - "max mem: {memory:.0f}", - ] - ) - else: - log_msg = self.delimiter.join( - [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] - ) - MB = 1024.0 * 1024.0 - for obj in iterable: - data_time.update(time.time() - end) - yield obj - iter_time.update(time.time() - end) - if print_freq is not None and i % print_freq == 0: - eta_seconds = iter_time.global_avg * (len(iterable) - i) - eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - if torch.cuda.is_available(): - print( - log_msg.format( - i, - len(iterable), - eta=eta_string, - meters=str(self), - time=str(iter_time), - data=str(data_time), - memory=torch.cuda.max_memory_allocated() / MB, - ) - ) - else: - print( - log_msg.format( - i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) - ) - ) - i += 1 - end = time.time() - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print(f"{header} Total time: {total_time_str}") diff --git a/references/depth/stereo/utils/losses.py b/references/depth/stereo/utils/losses.py deleted file mode 100644 index f950f1745ec..00000000000 --- a/references/depth/stereo/utils/losses.py +++ /dev/null @@ -1,503 +0,0 @@ -from typing import List, Optional - -import torch -from torch import nn, Tensor -from torch.nn import 
functional as F -from torchvision.prototype.models.depth.stereo.raft_stereo import grid_sample, make_coords_grid - - -def make_gaussian_kernel(kernel_size: int, sigma: float) -> torch.Tensor: - """Function to create a 2D Gaussian kernel.""" - - x = torch.arange(kernel_size, dtype=torch.float32) - y = torch.arange(kernel_size, dtype=torch.float32) - x = x - (kernel_size - 1) / 2 - y = y - (kernel_size - 1) / 2 - x, y = torch.meshgrid(x, y) - grid = (x**2 + y**2) / (2 * sigma**2) - kernel = torch.exp(-grid) - kernel = kernel / kernel.sum() - return kernel - - -def _sequence_loss_fn( - flow_preds: List[Tensor], - flow_gt: Tensor, - valid_flow_mask: Optional[Tensor], - gamma: Tensor, - max_flow: int = 256, - exclude_large: bool = False, - weights: Optional[Tensor] = None, -): - """Loss function defined over sequence of flow predictions""" - torch._assert( - gamma < 1, - "sequence_loss: `gamma` must be lower than 1, but got {}".format(gamma), - ) - - if exclude_large: - # exclude invalid pixels and extremely large diplacements - flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() - if valid_flow_mask is not None: - valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) - else: - valid_flow_mask = flow_norm < max_flow - - if valid_flow_mask is not None: - valid_flow_mask = valid_flow_mask.unsqueeze(1) - flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) - - abs_diff = (flow_preds - flow_gt).abs() - if valid_flow_mask is not None: - abs_diff = abs_diff * valid_flow_mask.unsqueeze(0) - - abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) - num_predictions = flow_preds.shape[0] - - # alocating on CPU and moving to device during run-time can force - # an unwanted GPU synchronization that produces a large overhead - if weights is None or len(weights) != num_predictions: - weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) - - flow_loss = (abs_diff * weights).sum() - return flow_loss, weights - - -class SequenceLoss(nn.Module): - def __init__(self, gamma: float = 0.8, max_flow: int = 256, exclude_large_flows: bool = False) -> None: - """ - Args: - gamma: value for the exponential weighting of the loss across frames - max_flow: maximum flow value to exclude - exclude_large_flows: whether to exclude large flows - """ - - super().__init__() - self.max_flow = max_flow - self.excluding_large = exclude_large_flows - self.register_buffer("gamma", torch.tensor([gamma])) - # cache the scale factor for the loss - self._weights = None - - def forward(self, flow_preds: List[Tensor], flow_gt: Tensor, valid_flow_mask: Optional[Tensor]) -> Tensor: - """ - Args: - flow_preds: list of flow predictions of shape (batch_size, C, H, W) - flow_gt: ground truth flow of shape (batch_size, C, H, W) - valid_flow_mask: mask of valid flow pixels of shape (batch_size, H, W) - """ - loss, weights = _sequence_loss_fn( - flow_preds, flow_gt, valid_flow_mask, self.gamma, self.max_flow, self.excluding_large, self._weights - ) - self._weights = weights - return loss - - def set_gamma(self, gamma: float) -> None: - self.gamma.fill_(gamma) - # reset the cached scale factor - self._weights = None - - -def _ssim_loss_fn( - source: Tensor, - reference: Tensor, - kernel: Tensor, - eps: float = 1e-8, - c1: float = 0.01**2, - c2: float = 0.03**2, - use_padding: bool = False, -) -> Tensor: - # ref: Algorithm section: https://en.wikipedia.org/wiki/Structural_similarity - # ref: Alternative implementation: 
https://kornia.readthedocs.io/en/latest/_modules/kornia/metrics/ssim.html#ssim - - torch._assert( - source.ndim == reference.ndim == 4, - "SSIM: `source` and `reference` must be 4-dimensional tensors", - ) - - torch._assert( - source.shape == reference.shape, - "SSIM: `source` and `reference` must have the same shape, but got {} and {}".format( - source.shape, reference.shape - ), - ) - - B, C, H, W = source.shape - kernel = kernel.unsqueeze(0).unsqueeze(0).repeat(C, 1, 1, 1) - if use_padding: - pad_size = kernel.shape[2] // 2 - source = F.pad(source, (pad_size, pad_size, pad_size, pad_size), "reflect") - reference = F.pad(reference, (pad_size, pad_size, pad_size, pad_size), "reflect") - - mu1 = F.conv2d(source, kernel, groups=C) - mu2 = F.conv2d(reference, kernel, groups=C) - - mu1_sq = mu1.pow(2) - mu2_sq = mu2.pow(2) - - mu1_mu2 = mu1 * mu2 - mu_img1_sq = F.conv2d(source.pow(2), kernel, groups=C) - mu_img2_sq = F.conv2d(reference.pow(2), kernel, groups=C) - mu_img1_mu2 = F.conv2d(source * reference, kernel, groups=C) - - sigma1_sq = mu_img1_sq - mu1_sq - sigma2_sq = mu_img2_sq - mu2_sq - sigma12 = mu_img1_mu2 - mu1_mu2 - - numerator = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2) - denominator = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) - ssim = numerator / (denominator + eps) - - # doing 1 - ssim because we want to maximize the ssim - return 1 - ssim.mean(dim=(1, 2, 3)) - - -class SSIM(nn.Module): - def __init__( - self, - kernel_size: int = 11, - max_val: float = 1.0, - sigma: float = 1.5, - eps: float = 1e-12, - use_padding: bool = True, - ) -> None: - """SSIM loss function. - - Args: - kernel_size: size of the Gaussian kernel - max_val: constant scaling factor - sigma: sigma of the Gaussian kernel - eps: constant for division by zero - use_padding: whether to pad the input tensor such that we have a score for each pixel - """ - super().__init__() - - self.kernel_size = kernel_size - self.max_val = max_val - self.sigma = sigma - - gaussian_kernel = make_gaussian_kernel(kernel_size, sigma) - self.register_buffer("gaussian_kernel", gaussian_kernel) - - self.c1 = (0.01 * self.max_val) ** 2 - self.c2 = (0.03 * self.max_val) ** 2 - - self.use_padding = use_padding - self.eps = eps - - def forward(self, source: torch.Tensor, reference: torch.Tensor) -> torch.Tensor: - """ - Args: - source: source image of shape (batch_size, C, H, W) - reference: reference image of shape (batch_size, C, H, W) - - Returns: - SSIM loss of shape (batch_size,) - """ - return _ssim_loss_fn( - source, - reference, - kernel=self.gaussian_kernel, - c1=self.c1, - c2=self.c2, - use_padding=self.use_padding, - eps=self.eps, - ) - - -def _smoothness_loss_fn(img_gx: Tensor, img_gy: Tensor, val_gx: Tensor, val_gy: Tensor): - # ref: https://github.com/nianticlabs/monodepth2/blob/b676244e5a1ca55564eb5d16ab521a48f823af31/layers.py#L202 - - torch._assert( - img_gx.ndim >= 3, - "smoothness_loss: `img_gx` must be at least 3-dimensional tensor of shape (..., C, H, W)", - ) - - torch._assert( - img_gx.ndim == val_gx.ndim, - "smoothness_loss: `img_gx` and `depth_gx` must have the same dimensionality, but got {} and {}".format( - img_gx.ndim, val_gx.ndim - ), - ) - - for idx in range(img_gx.ndim): - torch._assert( - (img_gx.shape[idx] == val_gx.shape[idx] or (img_gx.shape[idx] == 1 or val_gx.shape[idx] == 1)), - "smoothness_loss: `img_gx` and `depth_gx` must have either the same shape or broadcastable shape, but got {} and {}".format( - img_gx.shape, val_gx.shape - ), - ) - - # -3 is channel dimension - weights_x = 
torch.exp(-torch.mean(torch.abs(val_gx), axis=-3, keepdim=True)) - weights_y = torch.exp(-torch.mean(torch.abs(val_gy), axis=-3, keepdim=True)) - - smoothness_x = img_gx * weights_x - smoothness_y = img_gy * weights_y - - smoothness = (torch.abs(smoothness_x) + torch.abs(smoothness_y)).mean(axis=(-3, -2, -1)) - return smoothness - - -class SmoothnessLoss(nn.Module): - def __init__(self) -> None: - super().__init__() - - def _x_gradient(self, img: Tensor) -> Tensor: - if img.ndim > 4: - original_shape = img.shape - is_reshaped = True - img = img.reshape(-1, *original_shape[-3:]) - else: - is_reshaped = False - - padded = F.pad(img, (0, 1, 0, 0), mode="replicate") - grad = padded[..., :, :-1] - padded[..., :, 1:] - if is_reshaped: - grad = grad.reshape(original_shape) - return grad - - def _y_gradient(self, x: torch.Tensor) -> torch.Tensor: - if x.ndim > 4: - original_shape = x.shape - is_reshaped = True - x = x.reshape(-1, *original_shape[-3:]) - else: - is_reshaped = False - - padded = F.pad(x, (0, 0, 0, 1), mode="replicate") - grad = padded[..., :-1, :] - padded[..., 1:, :] - if is_reshaped: - grad = grad.reshape(original_shape) - return grad - - def forward(self, images: Tensor, vals: Tensor) -> Tensor: - """ - Args: - images: tensor of shape (D1, D2, ..., DN, C, H, W) - depths: tensor of shape (D1, D2, ..., DN, 1, H, W) - - Returns: - smoothness loss of shape (D1, D2, ..., DN) - """ - img_gx = self._x_gradient(images) - img_gy = self._y_gradient(images) - - val_gx = self._x_gradient(vals) - val_gy = self._y_gradient(vals) - - return _smoothness_loss_fn(img_gx, img_gy, val_gx, val_gy) - - -def _flow_sequence_consistency_loss_fn( - flow_preds: List[Tensor], - gamma: float = 0.8, - resize_factor: float = 0.25, - rescale_factor: float = 0.25, - rescale_mode: str = "bilinear", - weights: Optional[Tensor] = None, -): - """Loss function defined over sequence of flow predictions""" - - # Simplified version of ref: https://arxiv.org/pdf/2006.11242.pdf - # In the original paper, an additional refinement network is used to refine a flow prediction. - # Each step performed by the recurrent module in Raft or CREStereo is a refinement step using a delta_flow update. - # which should be consistent with the previous step. In this implementation, we simplify the overall loss - # term and ignore left-right consistency loss or photometric loss which can be treated separetely. 
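The comment above reduces the consistency term to: each refinement step should stay close to its predecessor, with the most recent differences weighted most heavily by the same gamma schedule used for the sequence loss. A self-contained sketch of that weighting, assuming a list of (B, C, H, W) predictions (the function name and defaults are illustrative):

import torch

def consistency_between_steps(preds, gamma=0.8):
    # preds: successive flow refinements; penalise the squared change between
    # consecutive steps, giving the latest differences the largest weight.
    preds = torch.stack(preds)                                # (N, B, C, H, W)
    step_diffs = (preds[1:] - preds[:-1]).square().mean(dim=(1, 2, 3, 4))
    weights = gamma ** torch.arange(
        step_diffs.numel() - 1, -1, -1, device=preds.device, dtype=preds.dtype
    )
    return (step_diffs * weights).sum()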
- - torch._assert( - rescale_factor <= 1.0, - "sequence_consistency_loss: `rescale_factor` must be less than or equal to 1, but got {}".format( - rescale_factor - ), - ) - - flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) - N, B, C, H, W = flow_preds.shape - - # rescale flow predictions to account for bilinear upsampling artifacts - if rescale_factor: - flow_preds = ( - F.interpolate( - flow_preds.view(N * B, C, H, W), scale_factor=resize_factor, mode=rescale_mode, align_corners=True - ) - ) * rescale_factor - flow_preds = torch.stack(torch.chunk(flow_preds, N, dim=0), dim=0) - - # force the next prediction to be similar to the previous prediction - abs_diff = (flow_preds[1:] - flow_preds[:-1]).square() - abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) - - num_predictions = flow_preds.shape[0] - 1 # because we are comparing differences - if weights is None or len(weights) != num_predictions: - weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) - - flow_loss = (abs_diff * weights).sum() - return flow_loss, weights - - -class FlowSequenceConsistencyLoss(nn.Module): - def __init__( - self, - gamma: float = 0.8, - resize_factor: float = 0.25, - rescale_factor: float = 0.25, - rescale_mode: str = "bilinear", - ) -> None: - super().__init__() - self.gamma = gamma - self.resize_factor = resize_factor - self.rescale_factor = rescale_factor - self.rescale_mode = rescale_mode - self._weights = None - - def forward(self, flow_preds: List[Tensor]) -> Tensor: - """ - Args: - flow_preds: list of tensors of shape (batch_size, C, H, W) - - Returns: - sequence consistency loss of shape (batch_size,) - """ - loss, weights = _flow_sequence_consistency_loss_fn( - flow_preds, - gamma=self.gamma, - resize_factor=self.resize_factor, - rescale_factor=self.rescale_factor, - rescale_mode=self.rescale_mode, - weights=self._weights, - ) - self._weights = weights - return loss - - def set_gamma(self, gamma: float) -> None: - self.gamma.fill_(gamma) - # reset the cached scale factor - self._weights = None - - -def _psnr_loss_fn(source: torch.Tensor, target: torch.Tensor, max_val: float) -> torch.Tensor: - torch._assert( - source.shape == target.shape, - "psnr_loss: source and target must have the same shape, but got {} and {}".format(source.shape, target.shape), - ) - - # ref https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio - return 10 * torch.log10(max_val**2 / ((source - target).pow(2).mean(axis=(-3, -2, -1)))) - - -class PSNRLoss(nn.Module): - def __init__(self, max_val: float = 256) -> None: - """ - Args: - max_val: maximum value of the input tensor. This refers to the maximum domain value of the input tensor. 
- - """ - super().__init__() - self.max_val = max_val - - def forward(self, source: Tensor, target: Tensor) -> Tensor: - """ - Args: - source: tensor of shape (D1, D2, ..., DN, C, H, W) - target: tensor of shape (D1, D2, ..., DN, C, H, W) - - Returns: - psnr loss of shape (D1, D2, ..., DN) - """ - - # multiply by -1 as we want to maximize the psnr - return -1 * _psnr_loss_fn(source, target, self.max_val) - - -class FlowPhotoMetricLoss(nn.Module): - def __init__( - self, - ssim_weight: float = 0.85, - ssim_window_size: int = 11, - ssim_max_val: float = 1.0, - ssim_sigma: float = 1.5, - ssim_eps: float = 1e-12, - ssim_use_padding: bool = True, - max_displacement_ratio: float = 0.15, - ) -> None: - super().__init__() - - self._ssim_loss = SSIM( - kernel_size=ssim_window_size, - max_val=ssim_max_val, - sigma=ssim_sigma, - eps=ssim_eps, - use_padding=ssim_use_padding, - ) - - self._L1_weight = 1 - ssim_weight - self._SSIM_weight = ssim_weight - self._max_displacement_ratio = max_displacement_ratio - - def forward( - self, - source: Tensor, - reference: Tensor, - flow_pred: Tensor, - valid_mask: Optional[Tensor] = None, - ): - """ - Args: - source: tensor of shape (B, C, H, W) - reference: tensor of shape (B, C, H, W) - flow_pred: tensor of shape (B, 2, H, W) - valid_mask: tensor of shape (B, H, W) or None - - Returns: - photometric loss of shape - - """ - torch._assert( - source.ndim == 4, - "FlowPhotoMetricLoss: source must have 4 dimensions, but got {}".format(source.ndim), - ) - torch._assert( - reference.ndim == source.ndim, - "FlowPhotoMetricLoss: source and other must have the same number of dimensions, but got {} and {}".format( - source.ndim, reference.ndim - ), - ) - torch._assert( - flow_pred.shape[1] == 2, - "FlowPhotoMetricLoss: flow_pred must have 2 channels, but got {}".format(flow_pred.shape[1]), - ) - torch._assert( - flow_pred.ndim == 4, - "FlowPhotoMetricLoss: flow_pred must have 4 dimensions, but got {}".format(flow_pred.ndim), - ) - - B, C, H, W = source.shape - flow_channels = flow_pred.shape[1] - - max_displacements = [] - for dim in range(flow_channels): - shape_index = -1 - dim - max_displacements.append(int(self._max_displacement_ratio * source.shape[shape_index])) - - # mask out all pixels that have larger flow than the max flow allowed - max_flow_mask = torch.logical_and( - *[flow_pred[:, dim, :, :] < max_displacements[dim] for dim in range(flow_channels)] - ) - - if valid_mask is not None: - valid_mask = torch.logical_and(valid_mask, max_flow_mask).unsqueeze(1) - else: - valid_mask = max_flow_mask.unsqueeze(1) - - grid = make_coords_grid(B, H, W, device=str(source.device)) - resampled_grids = grid - flow_pred - resampled_grids = resampled_grids.permute(0, 2, 3, 1) - resampled_source = grid_sample(reference, resampled_grids, mode="bilinear") - - # compute SSIM loss - ssim_loss = self._ssim_loss(resampled_source * valid_mask, source * valid_mask) - l1_loss = (resampled_source * valid_mask - source * valid_mask).abs().mean(axis=(-3, -2, -1)) - loss = self._L1_weight * l1_loss + self._SSIM_weight * ssim_loss - - return loss.mean() diff --git a/references/depth/stereo/utils/metrics.py b/references/depth/stereo/utils/metrics.py deleted file mode 100644 index eaf67822e92..00000000000 --- a/references/depth/stereo/utils/metrics.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import Dict, List, Optional, Tuple - -from torch import Tensor - -AVAILABLE_METRICS = ["mae", "rmse", "epe", "bad1", "bad2", "epe", "1px", "3px", "5px", "fl-all", "relepe"] - - -def compute_metrics( - 
flow_pred: Tensor, flow_gt: Tensor, valid_flow_mask: Optional[Tensor], metrics: List[str] -) -> Tuple[Dict[str, float], int]: - for m in metrics: - if m not in AVAILABLE_METRICS: - raise ValueError(f"Invalid metric: {m}. Valid metrics are: {AVAILABLE_METRICS}") - - metrics_dict = {} - - pixels_diffs = (flow_pred - flow_gt).abs() - # there is no Y flow in Stereo Matching, therefor flow.abs() = flow.pow(2).sum(dim=1).sqrt() - flow_norm = flow_gt.abs() - - if valid_flow_mask is not None: - valid_flow_mask = valid_flow_mask.unsqueeze(1) - pixels_diffs = pixels_diffs[valid_flow_mask] - flow_norm = flow_norm[valid_flow_mask] - - num_pixels = pixels_diffs.numel() - if "bad1" in metrics: - metrics_dict["bad1"] = (pixels_diffs > 1).float().mean().item() - if "bad2" in metrics: - metrics_dict["bad2"] = (pixels_diffs > 2).float().mean().item() - - if "mae" in metrics: - metrics_dict["mae"] = pixels_diffs.mean().item() - if "rmse" in metrics: - metrics_dict["rmse"] = pixels_diffs.pow(2).mean().sqrt().item() - if "epe" in metrics: - metrics_dict["epe"] = pixels_diffs.mean().item() - if "1px" in metrics: - metrics_dict["1px"] = (pixels_diffs < 1).float().mean().item() - if "3px" in metrics: - metrics_dict["3px"] = (pixels_diffs < 3).float().mean().item() - if "5px" in metrics: - metrics_dict["5px"] = (pixels_diffs < 5).float().mean().item() - if "fl-all" in metrics: - metrics_dict["fl-all"] = ((pixels_diffs < 3) & ((pixels_diffs / flow_norm) < 0.05)).float().mean().item() * 100 - if "relepe" in metrics: - metrics_dict["relepe"] = (pixels_diffs / flow_norm).mean().item() - - return metrics_dict, num_pixels diff --git a/references/depth/stereo/utils/norm.py b/references/depth/stereo/utils/norm.py deleted file mode 100644 index 7f6e0011160..00000000000 --- a/references/depth/stereo/utils/norm.py +++ /dev/null @@ -1,13 +0,0 @@ -import torch - - -def freeze_batch_norm(model): - for m in model.modules(): - if isinstance(m, torch.nn.BatchNorm2d): - m.eval() - - -def unfreeze_batch_norm(model): - for m in model.modules(): - if isinstance(m, torch.nn.BatchNorm2d): - m.train() diff --git a/references/depth/stereo/utils/padder.py b/references/depth/stereo/utils/padder.py deleted file mode 100644 index 7d2c63afba6..00000000000 --- a/references/depth/stereo/utils/padder.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch.nn.functional as F - - -class InputPadder: - """Pads images such that dimensions are divisible by 8""" - - # TODO: Ideally, this should be part of the eval transforms preset, instead - # of being part of the validation code. It's not obvious what a good - # solution would be, because we need to unpad the predicted flows according - # to the input images' size, and in some datasets (Kitti) images can have - # variable sizes. 
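The padding amounts computed below are just the distance from each side to the next multiple of 8. An equivalent one-liner with a worked example, purely for illustration (the helper name is not part of the reference code):

def pad_amount_to_multiple_of_8(ht: int, wd: int):
    # (((ht // 8) + 1) * 8 - ht) % 8 simplifies to (-ht) % 8 in Python,
    # e.g. a 375 x 1242 image needs (1, 6) extra rows/columns.
    return (-ht) % 8, (-wd) % 8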
- - def __init__(self, dims, mode="sintel"): - self.ht, self.wd = dims[-2:] - pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 - pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 - if mode == "sintel": - self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2] - else: - self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] - - def pad(self, *inputs): - return [F.pad(x, self._pad, mode="replicate") for x in inputs] - - def unpad(self, x): - ht, wd = x.shape[-2:] - c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] - return x[..., c[0] : c[1], c[2] : c[3]] diff --git a/references/depth/stereo/vizualization.py b/references/depth/stereo/vizualization.py deleted file mode 100644 index d043d274614..00000000000 --- a/references/depth/stereo/vizualization.py +++ /dev/null @@ -1,126 +0,0 @@ -import os -from typing import List - -import numpy as np -import torch -from torch import Tensor -from torchvision.utils import make_grid - - -@torch.no_grad() -def make_disparity_image(disparity: Tensor): - # normalize image to [0, 1] - disparity = disparity.detach().cpu() - disparity = (disparity - disparity.min()) / (disparity.max() - disparity.min()) - return disparity - - -@torch.no_grad() -def make_disparity_image_pairs(disparity: Tensor, image: Tensor): - disparity = make_disparity_image(disparity) - # image is in [-1, 1], bring it to [0, 1] - image = image.detach().cpu() - image = image * 0.5 + 0.5 - return disparity, image - - -@torch.no_grad() -def make_disparity_sequence(disparities: List[Tensor]): - # convert each disparity to [0, 1] - for idx, disparity_batch in enumerate(disparities): - disparities[idx] = torch.stack(list(map(make_disparity_image, disparity_batch))) - # make the list into a batch - disparity_sequences = torch.stack(disparities) - return disparity_sequences - - -@torch.no_grad() -def make_pair_grid(*inputs, orientation="horizontal"): - # make a grid of images with the outputs and references side by side - if orientation == "horizontal": - # interleave the outputs and references - canvas = torch.zeros_like(inputs[0]) - canvas = torch.cat([canvas] * len(inputs), dim=0) - size = len(inputs) - for idx, inp in enumerate(inputs): - canvas[idx::size, ...] = inp - grid = make_grid(canvas, nrow=len(inputs), padding=16, normalize=True, scale_each=True) - elif orientation == "vertical": - # interleave the outputs and references - canvas = torch.cat(inputs, dim=0) - size = len(inputs) - for idx, inp in enumerate(inputs): - canvas[idx::size, ...] 
= inp - grid = make_grid(canvas, nrow=len(inputs[0]), padding=16, normalize=True, scale_each=True) - else: - raise ValueError("Unknown orientation: {}".format(orientation)) - return grid - - -@torch.no_grad() -def make_training_sample_grid( - left_images: Tensor, - right_images: Tensor, - disparities: Tensor, - masks: Tensor, - predictions: List[Tensor], -) -> np.ndarray: - # detach images and renormalize to [0, 1] - images_left = left_images.detach().cpu() * 0.5 + 0.5 - images_right = right_images.detach().cpu() * 0.5 + 0.5 - # detach the disparities and predictions - disparities = disparities.detach().cpu() - predictions = predictions[-1].detach().cpu() - # keep only the first channel of pixels, and repeat it 3 times - disparities = disparities[:, :1, ...].repeat(1, 3, 1, 1) - predictions = predictions[:, :1, ...].repeat(1, 3, 1, 1) - # unsqueeze and repeat the masks - masks = masks.detach().cpu().unsqueeze(1).repeat(1, 3, 1, 1) - # make a grid that will self normalize across the batch - pred_grid = make_pair_grid(images_left, images_right, masks, disparities, predictions, orientation="horizontal") - pred_grid = pred_grid.permute(1, 2, 0).numpy() - pred_grid = (pred_grid * 255).astype(np.uint8) - return pred_grid - - -@torch.no_grad() -def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> np.ndarray: - # right-most we will be adding the ground truth - seq_len = len(predictions) + 1 - predictions = list(map(lambda x: x[:, :1, :, :].detach().cpu(), predictions + [disparities])) - sequence = make_disparity_sequence(predictions) - # swap axes to have them in the correct order for each batch sample - sequence = torch.swapaxes(sequence, 0, 1).contiguous().reshape(-1, 1, disparities.shape[-2], disparities.shape[-1]) - sequence = make_grid(sequence, nrow=seq_len, padding=16, normalize=True, scale_each=True) - sequence = sequence.permute(1, 2, 0).numpy() - sequence = (sequence * 255).astype(np.uint8) - return sequence - - -@torch.no_grad() -def make_prediction_image_side_to_side( - predictions: Tensor, disparities: Tensor, valid_mask: Tensor, save_path: str, prefix: str -) -> None: - import matplotlib.pyplot as plt - - # normalize the predictions and disparities in [0, 1] - predictions = (predictions - predictions.min()) / (predictions.max() - predictions.min()) - disparities = (disparities - disparities.min()) / (disparities.max() - disparities.min()) - predictions = predictions * valid_mask - disparities = disparities * valid_mask - - predictions = predictions.detach().cpu() - disparities = disparities.detach().cpu() - - for idx, (pred, gt) in enumerate(zip(predictions, disparities)): - pred = pred.permute(1, 2, 0).numpy() - gt = gt.permute(1, 2, 0).numpy() - # plot pred and gt side by side - fig, ax = plt.subplots(1, 2, figsize=(10, 5)) - ax[0].imshow(pred) - ax[0].set_title("Prediction") - ax[1].imshow(gt) - ax[1].set_title("Ground Truth") - save_name = os.path.join(save_path, "{}_{}.png".format(prefix, idx)) - plt.savefig(save_name) - plt.close() diff --git a/setup.py b/setup.py index 25bef6b50de..951989050b5 100644 --- a/setup.py +++ b/setup.py @@ -539,7 +539,7 @@ def run(self): license="BSD", # Package info packages=find_packages(exclude=("test",)), - package_data={package_name: ["*.dll", "*.dylib", "*.so", "prototype/datasets/_builtin/*.categories"]}, + package_data={package_name: ["*.dll", "*.dylib", "*.so"]}, zip_safe=False, install_requires=requirements, extras_require={ diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py deleted file
mode 100644 index 8c5484a2823..00000000000 --- a/test/builtin_dataset_mocks.py +++ /dev/null @@ -1,1568 +0,0 @@ -import bz2 -import collections.abc -import csv -import functools -import gzip -import io -import itertools -import json -import lzma -import pathlib -import pickle -import random -import shutil -import unittest.mock -import warnings -import xml.etree.ElementTree as ET -from collections import Counter, defaultdict - -import numpy as np -import pytest -import torch -from datasets_utils import combinations_grid, create_image_file, create_image_folder, make_tar, make_zip -from torch.nn.functional import one_hot -from torch.testing import make_tensor as _make_tensor -from torchvision.prototype import datasets - -make_tensor = functools.partial(_make_tensor, device="cpu") -make_scalar = functools.partial(make_tensor, ()) - - -__all__ = ["DATASET_MOCKS", "parametrize_dataset_mocks"] - - -class DatasetMock: - def __init__(self, name, *, mock_data_fn, configs): - # FIXME: error handling for unknown names - self.name = name - self.mock_data_fn = mock_data_fn - self.configs = configs - - def _parse_mock_info(self, mock_info): - if mock_info is None: - raise pytest.UsageError( - f"The mock data function for dataset '{self.name}' returned nothing. It needs to at least return an " - f"integer indicating the number of samples for the current `config`." - ) - elif isinstance(mock_info, int): - mock_info = dict(num_samples=mock_info) - elif not isinstance(mock_info, dict): - raise pytest.UsageError( - f"The mock data function for dataset '{self.name}' returned a {type(mock_info)}. The returned object " - f"should be a dictionary containing at least the number of samples for the key `'num_samples'`. If no " - f"additional information is required for specific tests, the number of samples can also be returned as " - f"an integer." - ) - elif "num_samples" not in mock_info: - raise pytest.UsageError( - f"The dictionary returned by the mock data function for dataset '{self.name}' has to contain a " - f"`'num_samples'` entry indicating the number of samples." - ) - - return mock_info - - def load(self, config): - # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in - # test/test_prototype_builtin_datasets.py - root = pathlib.Path(datasets.home()) / self.name - # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn, - # this will only download **and** preprocess if the file is not present. In other words, if we already place - # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing. - # To avoid that we first place the mock data in a temporary directory and patch the download logic to move it to - # `root` only when it is requested. - tmp_mock_data_folder = root / "__mock__" - tmp_mock_data_folder.mkdir(parents=True) - - mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config)) - - def patched_download(resource, root, **kwargs): - src = tmp_mock_data_folder / resource.file_name - if not src.exists(): - raise pytest.UsageError( - f"Dataset '{self.name}' requires the file {resource.file_name} for {config}" - f"but it was not created by the mock data function." 
- ) - - dst = root / resource.file_name - shutil.move(str(src), str(root)) - - return dst - - with unittest.mock.patch( - "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download - ): - dataset = datasets.load(self.name, **config) - - extra_files = list(tmp_mock_data_folder.glob("**/*")) - if extra_files: - raise pytest.UsageError( - ( - f"Dataset '{self.name}' created the following files for {config} in the mock data function, " - f"but they were not loaded:\n\n" - ) - + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files) - ) - - tmp_mock_data_folder.rmdir() - - return dataset, mock_info - - -def config_id(name, config): - parts = [name] - for name, value in config.items(): - if isinstance(value, bool): - part = ("" if value else "no_") + name - else: - part = str(value) - parts.append(part) - return "-".join(parts) - - -def parametrize_dataset_mocks(*dataset_mocks, marks=None): - mocks = {} - for mock in dataset_mocks: - if isinstance(mock, DatasetMock): - mocks[mock.name] = mock - elif isinstance(mock, collections.abc.Mapping): - mocks.update(mock) - else: - raise pytest.UsageError( - f"The positional arguments passed to `parametrize_dataset_mocks` can either be a `DatasetMock`, " - f"a sequence of `DatasetMock`'s, or a mapping of names to `DatasetMock`'s, " - f"but got {mock} instead." - ) - dataset_mocks = mocks - - if marks is None: - marks = {} - elif not isinstance(marks, collections.abc.Mapping): - raise pytest.UsageError() - - return pytest.mark.parametrize( - ("dataset_mock", "config"), - [ - pytest.param(dataset_mock, config, id=config_id(name, config), marks=marks.get(name, ())) - for name, dataset_mock in dataset_mocks.items() - for config in dataset_mock.configs - ], - ) - - -DATASET_MOCKS = {} - - -def register_mock(name=None, *, configs): - def wrapper(mock_data_fn): - nonlocal name - if name is None: - name = mock_data_fn.__name__ - DATASET_MOCKS[name] = DatasetMock(name, mock_data_fn=mock_data_fn, configs=configs) - - return mock_data_fn - - return wrapper - - -class MNISTMockData: - _DTYPES_ID = { - torch.uint8: 8, - torch.int8: 9, - torch.int16: 11, - torch.int32: 12, - torch.float32: 13, - torch.float64: 14, - } - - @classmethod - def _magic(cls, dtype, ndim): - return cls._DTYPES_ID[dtype] * 256 + ndim + 1 - - @staticmethod - def _encode(t): - return torch.tensor(t, dtype=torch.int32).numpy().tobytes()[::-1] - - @staticmethod - def _big_endian_dtype(dtype): - np_dtype = getattr(np, str(dtype).replace("torch.", ""))().dtype - return np.dtype(f">{np_dtype.kind}{np_dtype.itemsize}") - - @classmethod - def _create_binary_file(cls, root, filename, *, num_samples, shape, dtype, compressor, low=0, high): - with compressor(root / filename, "wb") as fh: - for meta in (cls._magic(dtype, len(shape)), num_samples, *shape): - fh.write(cls._encode(meta)) - - data = make_tensor((num_samples, *shape), dtype=dtype, low=low, high=high) - - fh.write(data.numpy().astype(cls._big_endian_dtype(dtype)).tobytes()) - - @classmethod - def generate( - cls, - root, - *, - num_categories, - num_samples=None, - images_file, - labels_file, - image_size=(28, 28), - image_dtype=torch.uint8, - label_size=(), - label_dtype=torch.uint8, - compressor=None, - ): - if num_samples is None: - num_samples = num_categories - if compressor is None: - compressor = gzip.open - - cls._create_binary_file( - root, - images_file, - num_samples=num_samples, - shape=image_size, - dtype=image_dtype, - compressor=compressor, - high=float("inf"), - ) - 
cls._create_binary_file( - root, - labels_file, - num_samples=num_samples, - shape=label_size, - dtype=label_dtype, - compressor=compressor, - high=num_categories, - ) - - return num_samples - - -def mnist(root, config): - prefix = "train" if config["split"] == "train" else "t10k" - return MNISTMockData.generate( - root, - num_categories=10, - images_file=f"{prefix}-images-idx3-ubyte.gz", - labels_file=f"{prefix}-labels-idx1-ubyte.gz", - ) - - -DATASET_MOCKS.update( - { - name: DatasetMock(name, mock_data_fn=mnist, configs=combinations_grid(split=("train", "test"))) - for name in ["mnist", "fashionmnist", "kmnist"] - } -) - - -@register_mock( - configs=combinations_grid( - split=("train", "test"), - image_set=("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"), - ) -) -def emnist(root, config): - num_samples_map = {} - file_names = set() - for split, image_set in itertools.product( - ("train", "test"), - ("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"), - ): - prefix = f"emnist-{image_set.replace('_', '').lower()}-{split}" - images_file = f"{prefix}-images-idx3-ubyte.gz" - labels_file = f"{prefix}-labels-idx1-ubyte.gz" - file_names.update({images_file, labels_file}) - num_samples_map[(split, image_set)] = MNISTMockData.generate( - root, - # The image sets that merge some lower case letters in their respective upper case variant, still use dense - # labels in the data files. Thus, num_categories != len(categories) there. - num_categories=47 if config["image_set"] in ("Balanced", "By_Merge") else 62, - images_file=images_file, - labels_file=labels_file, - ) - - make_zip(root, "emnist-gzip.zip", *file_names) - - return num_samples_map[(config["split"], config["image_set"])] - - -@register_mock(configs=combinations_grid(split=("train", "test", "test10k", "test50k", "nist"))) -def qmnist(root, config): - num_categories = 10 - if config["split"] == "train": - num_samples = num_samples_gen = num_categories + 2 - prefix = "qmnist-train" - suffix = ".gz" - compressor = gzip.open - elif config["split"].startswith("test"): - # The split 'test50k' is defined as the last 50k images beginning at index 10000. Thus, we need to create - # more than 10000 images for the dataset to not be empty. 
- num_samples_gen = 10001 - num_samples = { - "test": num_samples_gen, - "test10k": min(num_samples_gen, 10_000), - "test50k": num_samples_gen - 10_000, - }[config["split"]] - prefix = "qmnist-test" - suffix = ".gz" - compressor = gzip.open - else: # config["split"] == "nist" - num_samples = num_samples_gen = num_categories + 3 - prefix = "xnist" - suffix = ".xz" - compressor = lzma.open - - MNISTMockData.generate( - root, - num_categories=num_categories, - num_samples=num_samples_gen, - images_file=f"{prefix}-images-idx3-ubyte{suffix}", - labels_file=f"{prefix}-labels-idx2-int{suffix}", - label_size=(8,), - label_dtype=torch.int32, - compressor=compressor, - ) - return num_samples - - -class CIFARMockData: - NUM_PIXELS = 32 * 32 * 3 - - @classmethod - def _create_batch_file(cls, root, name, *, num_categories, labels_key, num_samples=1): - content = { - "data": make_tensor((num_samples, cls.NUM_PIXELS), dtype=torch.uint8).numpy(), - labels_key: torch.randint(0, num_categories, size=(num_samples,)).tolist(), - } - with open(pathlib.Path(root) / name, "wb") as fh: - pickle.dump(content, fh) - - @classmethod - def generate( - cls, - root, - name, - *, - folder, - train_files, - test_files, - num_categories, - labels_key, - ): - folder = root / folder - folder.mkdir() - files = (*train_files, *test_files) - for file in files: - cls._create_batch_file( - folder, - file, - num_categories=num_categories, - labels_key=labels_key, - ) - - make_tar(root, name, folder, compression="gz") - - -@register_mock(configs=combinations_grid(split=("train", "test"))) -def cifar10(root, config): - train_files = [f"data_batch_{idx}" for idx in range(1, 6)] - test_files = ["test_batch"] - - CIFARMockData.generate( - root=root, - name="cifar-10-python.tar.gz", - folder=pathlib.Path("cifar-10-batches-py"), - train_files=train_files, - test_files=test_files, - num_categories=10, - labels_key="labels", - ) - - return len(train_files if config["split"] == "train" else test_files) - - -@register_mock(configs=combinations_grid(split=("train", "test"))) -def cifar100(root, config): - train_files = ["train"] - test_files = ["test"] - - CIFARMockData.generate( - root=root, - name="cifar-100-python.tar.gz", - folder=pathlib.Path("cifar-100-python"), - train_files=train_files, - test_files=test_files, - num_categories=100, - labels_key="fine_labels", - ) - - return len(train_files if config["split"] == "train" else test_files) - - -@register_mock(configs=[dict()]) -def caltech101(root, config): - def create_ann_file(root, name): - import scipy.io - - box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16) - obj_contour = make_tensor((2, int(torch.randint(3, 6, size=()))), dtype=torch.float64, low=0).numpy() - - scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour)) - - def create_ann_folder(root, name, file_name_fn, num_examples): - root = pathlib.Path(root) / name - root.mkdir(parents=True) - - for idx in range(num_examples): - create_ann_file(root, file_name_fn(idx)) - - images_root = root / "101_ObjectCategories" - anns_root = root / "Annotations" - - image_category_map = { - "Faces": "Faces_2", - "Faces_easy": "Faces_3", - "Motorbikes": "Motorbikes_16", - "airplanes": "Airplanes_Side_2", - } - - categories = ["Faces", "Faces_easy", "Motorbikes", "airplanes", "yin_yang"] - - num_images_per_category = 2 - for category in categories: - create_image_folder( - root=images_root, - name=category, - file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg", - 
num_examples=num_images_per_category, - ) - create_ann_folder( - root=anns_root, - name=image_category_map.get(category, category), - file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", - num_examples=num_images_per_category, - ) - - (images_root / "BACKGROUND_Goodle").mkdir() - make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz") - - make_tar(root, f"{anns_root.name}.tar", anns_root) - - return num_images_per_category * len(categories) - - -@register_mock(configs=[dict()]) -def caltech256(root, config): - dir = root / "256_ObjectCategories" - num_images_per_category = 2 - - categories = [ - (1, "ak47"), - (127, "laptop-101"), - (198, "spider"), - (257, "clutter"), - ] - - for category_idx, category in categories: - files = create_image_folder( - dir, - name=f"{category_idx:03d}.{category}", - file_name_fn=lambda image_idx: f"{category_idx:03d}_{image_idx + 1:04d}.jpg", - num_examples=num_images_per_category, - ) - if category == "spider": - open(files[0].parent / "RENAME2", "w").close() - - make_tar(root, f"{dir.name}.tar", dir) - - return num_images_per_category * len(categories) - - -@register_mock(configs=combinations_grid(split=("train", "val", "test"))) -def imagenet(root, config): - from scipy.io import savemat - - info = datasets.info("imagenet") - - if config["split"] == "train": - num_samples = len(info["wnids"]) - archive_name = "ILSVRC2012_img_train.tar" - - files = [] - for wnid in info["wnids"]: - create_image_folder( - root=root, - name=wnid, - file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG", - num_examples=1, - ) - files.append(make_tar(root, f"{wnid}.tar")) - elif config["split"] == "val": - num_samples = 3 - archive_name = "ILSVRC2012_img_val.tar" - files = [create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG") for idx in range(num_samples)] - - devkit_root = root / "ILSVRC2012_devkit_t12" - data_root = devkit_root / "data" - data_root.mkdir(parents=True) - - with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file: - for label in torch.randint(0, len(info["wnids"]), (num_samples,)).tolist(): - file.write(f"{label}\n") - - num_children = 0 - synsets = [ - (idx, wnid, category, "", num_children, [], 0, 0) - for idx, (category, wnid) in enumerate(zip(info["categories"], info["wnids"]), 1) - ] - num_children = 1 - synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5)) - with warnings.catch_warnings(): - # The warning is not for savemat, but rather for some internals savemet is using - warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) - savemat(data_root / "meta.mat", dict(synsets=synsets)) - - make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz") - else: # config["split"] == "test" - num_samples = 5 - archive_name = "ILSVRC2012_img_test_v10102019.tar" - files = [create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG") for idx in range(num_samples)] - - make_tar(root, archive_name, *files) - - return num_samples - - -class CocoMockData: - @classmethod - def _make_annotations_json( - cls, - root, - name, - *, - images_meta, - fn, - ): - num_anns_per_image = torch.randint(1, 5, (len(images_meta),)) - num_anns_total = int(num_anns_per_image.sum()) - ann_ids_iter = iter(torch.arange(num_anns_total)[torch.randperm(num_anns_total)]) - - anns_meta = [] - for image_meta, num_anns in zip(images_meta, num_anns_per_image): - for _ in range(num_anns): - ann_id = int(next(ann_ids_iter)) - anns_meta.append(dict(fn(ann_id, image_meta), id=ann_id, 
image_id=image_meta["id"])) - anns_meta.sort(key=lambda ann: ann["id"]) - - with open(root / name, "w") as file: - json.dump(dict(images=images_meta, annotations=anns_meta), file) - - return num_anns_per_image - - @staticmethod - def _make_instances_data(ann_id, image_meta): - def make_rle_segmentation(): - height, width = image_meta["height"], image_meta["width"] - numel = height * width - counts = [] - while sum(counts) <= numel: - counts.append(int(torch.randint(5, 8, ()))) - if sum(counts) > numel: - counts[-1] -= sum(counts) - numel - return dict(counts=counts, size=[height, width]) - - return dict( - segmentation=make_rle_segmentation(), - bbox=make_tensor((4,), dtype=torch.float32, low=0).tolist(), - iscrowd=True, - area=float(make_scalar(dtype=torch.float32)), - category_id=int(make_scalar(dtype=torch.int64)), - ) - - @staticmethod - def _make_captions_data(ann_id, image_meta): - return dict(caption=f"Caption {ann_id} describing image {image_meta['id']}.") - - @classmethod - def _make_annotations(cls, root, name, *, images_meta): - num_anns_per_image = torch.zeros((len(images_meta),), dtype=torch.int64) - for annotations, fn in ( - ("instances", cls._make_instances_data), - ("captions", cls._make_captions_data), - ): - num_anns_per_image += cls._make_annotations_json( - root, f"{annotations}_{name}.json", images_meta=images_meta, fn=fn - ) - - return int(num_anns_per_image.sum()) - - @classmethod - def generate( - cls, - root, - *, - split, - year, - num_samples, - ): - annotations_dir = root / "annotations" - annotations_dir.mkdir() - - for split_ in ("train", "val"): - config_name = f"{split_}{year}" - - images_meta = [ - dict( - file_name=f"{idx:012d}.jpg", - id=idx, - width=width, - height=height, - ) - for idx, (height, width) in enumerate( - torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist() - ) - ] - - if split_ == split: - create_image_folder( - root, - config_name, - file_name_fn=lambda idx: images_meta[idx]["file_name"], - num_examples=num_samples, - size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]), - ) - make_zip(root, f"{config_name}.zip") - - cls._make_annotations( - annotations_dir, - config_name, - images_meta=images_meta, - ) - - make_zip(root, f"annotations_trainval{year}.zip", annotations_dir) - - return num_samples - - -@register_mock( - configs=combinations_grid( - split=("train", "val"), - year=("2017", "2014"), - annotations=("instances", "captions", None), - ) -) -def coco(root, config): - return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5) - - -class SBDMockData: - _NUM_CATEGORIES = 20 - - @classmethod - def _make_split_files(cls, root_map): - ids_map = { - split: [f"2008_{idx:06d}" for idx in idcs] - for split, idcs in ( - ("train", [0, 1, 2]), - ("train_noval", [0, 2]), - ("val", [3]), - ) - } - - for split, ids in ids_map.items(): - with open(root_map[split] / f"{split}.txt", "w") as fh: - fh.writelines(f"{id}\n" for id in ids) - - return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()} - - @classmethod - def _make_anns_folder(cls, root, name, ids): - from scipy.io import savemat - - anns_folder = root / name - anns_folder.mkdir() - - sizes = torch.randint(1, 9, size=(len(ids), 2)).tolist() - for id, size in zip(ids, sizes): - savemat( - anns_folder / f"{id}.mat", - { - "GTcls": { - "Boundaries": cls._make_boundaries(size), - "Segmentation": cls._make_segmentation(size), - } - }, - ) - return sizes - - @classmethod - 
def _make_boundaries(cls, size): - from scipy.sparse import csc_matrix - - return [ - [csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] for _ in range(cls._NUM_CATEGORIES) - ] - - @classmethod - def _make_segmentation(cls, size): - return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy() - - @classmethod - def generate(cls, root): - archive_folder = root / "benchmark_RELEASE" - dataset_folder = archive_folder / "dataset" - dataset_folder.mkdir(parents=True, exist_ok=True) - - ids, num_samples_map = cls._make_split_files(defaultdict(lambda: dataset_folder, {"train_noval": root})) - sizes = cls._make_anns_folder(dataset_folder, "cls", ids) - create_image_folder( - dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx] - ) - - make_tar(root, "benchmark.tgz", archive_folder, compression="gz") - - return num_samples_map - - -@register_mock(configs=combinations_grid(split=("train", "val", "train_noval"))) -def sbd(root, config): - return SBDMockData.generate(root)[config["split"]] - - -@register_mock(configs=[dict()]) -def semeion(root, config): - num_samples = 3 - num_categories = 10 - - images = torch.rand(num_samples, 256) - labels = one_hot(torch.randint(num_categories, size=(num_samples,)), num_classes=num_categories) - with open(root / "semeion.data", "w") as fh: - for image, one_hot_label in zip(images, labels): - image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image]) - labels_columns = " ".join([str(label.item()) for label in one_hot_label]) - fh.write(f"{image_columns} {labels_columns} \n") - - return num_samples - - -class VOCMockData: - _TRAIN_VAL_FILE_NAMES = { - "2007": "VOCtrainval_06-Nov-2007.tar", - "2008": "VOCtrainval_14-Jul-2008.tar", - "2009": "VOCtrainval_11-May-2009.tar", - "2010": "VOCtrainval_03-May-2010.tar", - "2011": "VOCtrainval_25-May-2011.tar", - "2012": "VOCtrainval_11-May-2012.tar", - } - _TEST_FILE_NAMES = { - "2007": "VOCtest_06-Nov-2007.tar", - } - - @classmethod - def _make_split_files(cls, root, *, year, trainval): - split_folder = root / "ImageSets" - - if trainval: - idcs_map = { - "train": [0, 1, 2], - "val": [3, 4], - } - idcs_map["trainval"] = [*idcs_map["train"], *idcs_map["val"]] - else: - idcs_map = { - "test": [5], - } - ids_map = {split: [f"{year}_{idx:06d}" for idx in idcs] for split, idcs in idcs_map.items()} - - for task_sub_folder in ("Main", "Segmentation"): - task_folder = split_folder / task_sub_folder - task_folder.mkdir(parents=True, exist_ok=True) - for split, ids in ids_map.items(): - with open(task_folder / f"{split}.txt", "w") as fh: - fh.writelines(f"{id}\n" for id in ids) - - return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()} - - @classmethod - def _make_detection_anns_folder(cls, root, name, *, file_name_fn, num_examples): - folder = root / name - folder.mkdir(parents=True, exist_ok=True) - - for idx in range(num_examples): - cls._make_detection_ann_file(folder, file_name_fn(idx)) - - @classmethod - def _make_detection_ann_file(cls, root, name): - def add_child(parent, name, text=None): - child = ET.SubElement(parent, name) - child.text = str(text) - return child - - def add_name(obj, name="dog"): - add_child(obj, "name", name) - - def add_size(obj): - obj = add_child(obj, "size") - size = {"width": 0, "height": 0, "depth": 3} - for name, text in size.items(): - add_child(obj, name, text) - - def add_bndbox(obj): - obj = add_child(obj, "bndbox") - bndbox = 
{"xmin": 1, "xmax": 2, "ymin": 3, "ymax": 4} - for name, text in bndbox.items(): - add_child(obj, name, text) - - annotation = ET.Element("annotation") - add_size(annotation) - obj = add_child(annotation, "object") - add_name(obj) - add_bndbox(obj) - - with open(root / name, "wb") as fh: - fh.write(ET.tostring(annotation)) - - @classmethod - def generate(cls, root, *, year, trainval): - archive_folder = root - if year == "2011": - archive_folder = root / "TrainVal" - data_folder = archive_folder / "VOCdevkit" - else: - archive_folder = data_folder = root / "VOCdevkit" - data_folder = data_folder / f"VOC{year}" - data_folder.mkdir(parents=True, exist_ok=True) - - ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval) - for make_folder_fn, name, suffix in [ - (create_image_folder, "JPEGImages", ".jpg"), - (create_image_folder, "SegmentationClass", ".png"), - (cls._make_detection_anns_folder, "Annotations", ".xml"), - ]: - make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids)) - make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder) - - return num_samples_map - - -@register_mock( - configs=[ - *combinations_grid( - split=("train", "val", "trainval"), - year=("2007", "2008", "2009", "2010", "2011", "2012"), - task=("detection", "segmentation"), - ), - *combinations_grid( - split=("test",), - year=("2007",), - task=("detection", "segmentation"), - ), - ], -) -def voc(root, config): - trainval = config["split"] != "test" - return VOCMockData.generate(root, year=config["year"], trainval=trainval)[config["split"]] - - -class CelebAMockData: - @classmethod - def _make_ann_file(cls, root, name, data, *, field_names=None): - with open(root / name, "w") as file: - if field_names: - file.write(f"{len(data)}\r\n") - file.write(" ".join(field_names) + "\r\n") - file.writelines(" ".join(str(item) for item in row) + "\r\n" for row in data) - - _SPLIT_TO_IDX = { - "train": 0, - "val": 1, - "test": 2, - } - - @classmethod - def _make_split_file(cls, root): - num_samples_map = {"train": 4, "val": 3, "test": 2} - - data = [ - (f"{idx:06d}.jpg", cls._SPLIT_TO_IDX[split]) - for split, num_samples in num_samples_map.items() - for idx in range(num_samples) - ] - cls._make_ann_file(root, "list_eval_partition.txt", data) - - image_file_names, _ = zip(*data) - return image_file_names, num_samples_map - - @classmethod - def _make_identity_file(cls, root, image_file_names): - cls._make_ann_file( - root, "identity_CelebA.txt", [(name, int(make_scalar(low=1, dtype=torch.int))) for name in image_file_names] - ) - - @classmethod - def _make_attributes_file(cls, root, image_file_names): - field_names = ("5_o_Clock_Shadow", "Young") - data = [ - [name, *[" 1" if attr else "-1" for attr in make_tensor((len(field_names),), dtype=torch.bool)]] - for name in image_file_names - ] - cls._make_ann_file(root, "list_attr_celeba.txt", data, field_names=(*field_names, "")) - - @classmethod - def _make_bounding_boxes_file(cls, root, image_file_names): - field_names = ("image_id", "x_1", "y_1", "width", "height") - data = [ - [f"{name} ", *[f"{coord:3d}" for coord in make_tensor((4,), low=0, dtype=torch.int).tolist()]] - for name in image_file_names - ] - cls._make_ann_file(root, "list_bbox_celeba.txt", data, field_names=field_names) - - @classmethod - def _make_landmarks_file(cls, root, image_file_names): - field_names = ("lefteye_x", "lefteye_y", "rightmouth_x", "rightmouth_y") - data = [ - [ - name, - *[ - 
f"{coord:4d}" if idx else coord - for idx, coord in enumerate(make_tensor((len(field_names),), low=0, dtype=torch.int).tolist()) - ], - ] - for name in image_file_names - ] - cls._make_ann_file(root, "list_landmarks_align_celeba.txt", data, field_names=field_names) - - @classmethod - def generate(cls, root): - image_file_names, num_samples_map = cls._make_split_file(root) - - image_files = create_image_folder( - root, "img_align_celeba", file_name_fn=lambda idx: image_file_names[idx], num_examples=len(image_file_names) - ) - make_zip(root, image_files[0].parent.with_suffix(".zip").name) - - for make_ann_file_fn in ( - cls._make_identity_file, - cls._make_attributes_file, - cls._make_bounding_boxes_file, - cls._make_landmarks_file, - ): - make_ann_file_fn(root, image_file_names) - - return num_samples_map - - -@register_mock(configs=combinations_grid(split=("train", "val", "test"))) -def celeba(root, config): - return CelebAMockData.generate(root)[config["split"]] - - -@register_mock(configs=combinations_grid(split=("train", "val", "test"))) -def country211(root, config): - split_folder = pathlib.Path(root, "country211", "valid" if config["split"] == "val" else config["split"]) - split_folder.mkdir(parents=True, exist_ok=True) - - num_examples = { - "train": 3, - "val": 4, - "test": 5, - }[config["split"]] - - classes = ("AD", "BS", "GR") - for cls in classes: - create_image_folder( - split_folder, - name=cls, - file_name_fn=lambda idx: f"{idx}.jpg", - num_examples=num_examples, - ) - make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") - return num_examples * len(classes) - - -@register_mock(configs=combinations_grid(split=("train", "test"))) -def food101(root, config): - data_folder = root / "food-101" - - num_images_per_class = 3 - image_folder = data_folder / "images" - categories = ["apple_pie", "baby_back_ribs", "waffles"] - image_ids = [] - for category in categories: - image_files = create_image_folder( - image_folder, - category, - file_name_fn=lambda idx: f"{idx:04d}.jpg", - num_examples=num_images_per_class, - ) - image_ids.extend(path.relative_to(path.parents[1]).with_suffix("").as_posix() for path in image_files) - - meta_folder = data_folder / "meta" - meta_folder.mkdir() - - with open(meta_folder / "classes.txt", "w") as file: - for category in categories: - file.write(f"{category}\n") - - splits = ["train", "test"] - num_samples_map = {} - for offset, split in enumerate(splits): - image_ids_in_split = image_ids[offset :: len(splits)] - num_samples_map[split] = len(image_ids_in_split) - with open(meta_folder / f"{split}.txt", "w") as file: - for image_id in image_ids_in_split: - file.write(f"{image_id}\n") - - make_tar(root, f"{data_folder.name}.tar.gz", compression="gz") - - return num_samples_map[config["split"]] - - -@register_mock(configs=combinations_grid(split=("train", "val", "test"), fold=(1, 4, 10))) -def dtd(root, config): - data_folder = root / "dtd" - - num_images_per_class = 3 - image_folder = data_folder / "images" - categories = {"banded", "marbled", "zigzagged"} - image_ids_per_category = { - category: [ - str(path.relative_to(path.parents[1]).as_posix()) - for path in create_image_folder( - image_folder, - category, - file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg", - num_examples=num_images_per_class, - ) - ] - for category in categories - } - - meta_folder = data_folder / "labels" - meta_folder.mkdir() - - with open(meta_folder / "labels_joint_anno.txt", "w") as file: - for cls, image_ids in 
image_ids_per_category.items(): - for image_id in image_ids: - joint_categories = random.choices( - list(categories - {cls}), k=int(torch.randint(len(categories) - 1, ())) - ) - file.write(" ".join([image_id, *sorted([cls, *joint_categories])]) + "\n") - - image_ids = list(itertools.chain(*image_ids_per_category.values())) - splits = ("train", "val", "test") - num_samples_map = {} - for fold in range(1, 11): - random.shuffle(image_ids) - for offset, split in enumerate(splits): - image_ids_in_config = image_ids[offset :: len(splits)] - with open(meta_folder / f"{split}{fold}.txt", "w") as file: - file.write("\n".join(image_ids_in_config) + "\n") - - num_samples_map[(split, fold)] = len(image_ids_in_config) - - make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz") - - return num_samples_map[config["split"], config["fold"]] - - -@register_mock(configs=combinations_grid(split=("train", "test"))) -def fer2013(root, config): - split = config["split"] - num_samples = 5 if split == "train" else 3 - - path = root / f"{split}.csv" - with open(path, "w", newline="") as file: - field_names = ["emotion"] if split == "train" else [] - field_names.append("pixels") - - file.write(",".join(field_names) + "\n") - - writer = csv.DictWriter(file, fieldnames=field_names, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) - for _ in range(num_samples): - rowdict = { - "pixels": " ".join([str(int(pixel)) for pixel in torch.randint(256, (48 * 48,), dtype=torch.uint8)]) - } - if split == "train": - rowdict["emotion"] = int(torch.randint(7, ())) - writer.writerow(rowdict) - - make_zip(root, f"{path.name}.zip", path) - - return num_samples - - -@register_mock(configs=combinations_grid(split=("train", "test"))) -def gtsrb(root, config): - num_examples_per_class = 5 if config["split"] == "train" else 3 - classes = ("00000", "00042", "00012") - num_examples = num_examples_per_class * len(classes) - - csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] - - def _make_ann_file(path, num_examples, class_idx): - if class_idx == "random": - class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item() - - with open(path, "w") as csv_file: - writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";") - writer.writeheader() - for image_idx in range(num_examples): - writer.writerow( - { - "Filename": f"{image_idx:05d}.ppm", - "Width": torch.randint(1, 100, size=()).item(), - "Height": torch.randint(1, 100, size=()).item(), - "Roi.X1": torch.randint(1, 100, size=()).item(), - "Roi.Y1": torch.randint(1, 100, size=()).item(), - "Roi.X2": torch.randint(1, 100, size=()).item(), - "Roi.Y2": torch.randint(1, 100, size=()).item(), - "ClassId": class_idx, - } - ) - - archive_folder = root / "GTSRB" - - if config["split"] == "train": - train_folder = archive_folder / "Training" - train_folder.mkdir(parents=True) - - for class_idx in classes: - create_image_folder( - train_folder, - name=class_idx, - file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", - num_examples=num_examples_per_class, - ) - _make_ann_file( - path=train_folder / class_idx / f"GT-{class_idx}.csv", - num_examples=num_examples_per_class, - class_idx=int(class_idx), - ) - make_zip(root, "GTSRB-Training_fixed.zip", archive_folder) - else: - test_folder = archive_folder / "Final_Test" - test_folder.mkdir(parents=True) - - create_image_folder( - test_folder, - name="Images", - file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm", - num_examples=num_examples, - ) - - make_zip(root, 
"GTSRB_Final_Test_Images.zip", archive_folder) - - _make_ann_file( - path=root / "GT-final_test.csv", - num_examples=num_examples, - class_idx="random", - ) - - make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv") - - return num_examples - - -@register_mock(configs=combinations_grid(split=("train", "val", "test"))) -def clevr(root, config): - data_folder = root / "CLEVR_v1.0" - - num_samples_map = { - "train": 3, - "val": 2, - "test": 1, - } - - images_folder = data_folder / "images" - image_files = { - split: create_image_folder( - images_folder, - split, - file_name_fn=lambda idx: f"CLEVR_{split}_{idx:06d}.jpg", - num_examples=num_samples, - ) - for split, num_samples in num_samples_map.items() - } - - scenes_folder = data_folder / "scenes" - scenes_folder.mkdir() - for split in ["train", "val"]: - with open(scenes_folder / f"CLEVR_{split}_scenes.json", "w") as file: - json.dump( - { - "scenes": [ - { - "image_filename": image_file.name, - # We currently only return the number of objects in a scene. - # Thus, it is sufficient for now to only mock the number of elements. - "objects": [None] * int(torch.randint(1, 5, ())), - } - for image_file in image_files[split] - ] - }, - file, - ) - - make_zip(root, f"{data_folder.name}.zip", data_folder) - - return num_samples_map[config["split"]] - - -class OxfordIIITPetMockData: - @classmethod - def _meta_to_split_and_classification_ann(cls, meta, idx): - image_id = "_".join( - [ - *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()], - str(idx), - ] - ) - class_id = str(meta["label"] + 1) - species = "1" if meta["species"] == "cat" else "2" - breed_id = "-1" - return (image_id, class_id, species, breed_id) - - @classmethod - def generate(self, root): - classification_anns_meta = ( - dict(cls="Abyssinian", label=0, species="cat"), - dict(cls="Keeshond", label=18, species="dog"), - dict(cls="Yorkshire Terrier", label=36, species="dog"), - ) - split_and_classification_anns = [ - self._meta_to_split_and_classification_ann(meta, idx) - for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10)) - ] - image_ids, *_ = zip(*split_and_classification_anns) - - image_files = create_image_folder( - root, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids) - ) - - anns_folder = root / "annotations" - anns_folder.mkdir() - random.shuffle(split_and_classification_anns) - splits = ("trainval", "test") - num_samples_map = {} - for offset, split in enumerate(splits): - split_and_classification_anns_in_split = split_and_classification_anns[offset :: len(splits)] - with open(anns_folder / f"{split}.txt", "w") as file: - writer = csv.writer(file, delimiter=" ") - for split_and_classification_ann in split_and_classification_anns_in_split: - writer.writerow(split_and_classification_ann) - - num_samples_map[split] = len(split_and_classification_anns_in_split) - - segmentation_files = create_image_folder( - anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids) - ) - - # The dataset has some rogue files - for path in image_files[:3]: - path.with_suffix(".mat").touch() - for path in segmentation_files: - path.with_name(f".{path.name}").touch() - - make_tar(root, "images.tar.gz", compression="gz") - make_tar(root, anns_folder.with_suffix(".tar.gz").name, compression="gz") - - return num_samples_map - - -@register_mock(name="oxford-iiit-pet", configs=combinations_grid(split=("trainval", "test"))) -def oxford_iiit_pet(root, 
config): - return OxfordIIITPetMockData.generate(root)[config["split"]] - - -class _CUB200MockData: - @classmethod - def _category_folder(cls, category, idx): - return f"{idx:03d}.{category}" - - @classmethod - def _file_stem(cls, category, idx): - return f"{category}_{idx:04d}" - - @classmethod - def _make_images(cls, images_folder): - image_files = [] - for category_idx, category in [ - (1, "Black_footed_Albatross"), - (100, "Brown_Pelican"), - (200, "Common_Yellowthroat"), - ]: - image_files.extend( - create_image_folder( - images_folder, - cls._category_folder(category, category_idx), - lambda image_idx: f"{cls._file_stem(category, image_idx)}.jpg", - num_examples=5, - ) - ) - - return image_files - - -class CUB2002011MockData(_CUB200MockData): - @classmethod - def _make_archive(cls, root): - archive_folder = root / "CUB_200_2011" - - images_folder = archive_folder / "images" - image_files = cls._make_images(images_folder) - image_ids = list(range(1, len(image_files) + 1)) - - with open(archive_folder / "images.txt", "w") as file: - file.write( - "\n".join( - f"{id} {path.relative_to(images_folder).as_posix()}" for id, path in zip(image_ids, image_files) - ) - ) - - split_ids = torch.randint(2, (len(image_ids),)).tolist() - counts = Counter(split_ids) - num_samples_map = {"train": counts[1], "test": counts[0]} - with open(archive_folder / "train_test_split.txt", "w") as file: - file.write("\n".join(f"{image_id} {split_id}" for image_id, split_id in zip(image_ids, split_ids))) - - with open(archive_folder / "bounding_boxes.txt", "w") as file: - file.write( - "\n".join( - " ".join( - str(item) - for item in [image_id, *make_tensor((4,), dtype=torch.int, low=0).to(torch.float).tolist()] - ) - for image_id in image_ids - ) - ) - - make_tar(root, archive_folder.with_suffix(".tgz").name, compression="gz") - - return image_files, num_samples_map - - @classmethod - def _make_segmentations(cls, root, image_files): - segmentations_folder = root / "segmentations" - for image_file in image_files: - folder = segmentations_folder.joinpath(image_file.relative_to(image_file.parents[1])) - folder.mkdir(exist_ok=True, parents=True) - create_image_file( - folder, - image_file.with_suffix(".png").name, - size=[1, *make_tensor((2,), low=3, dtype=torch.int).tolist()], - ) - - make_tar(root, segmentations_folder.with_suffix(".tgz").name, compression="gz") - - @classmethod - def generate(cls, root): - image_files, num_samples_map = cls._make_archive(root) - cls._make_segmentations(root, image_files) - return num_samples_map - - -class CUB2002010MockData(_CUB200MockData): - @classmethod - def _make_hidden_rouge_file(cls, *files): - for file in files: - (file.parent / f"._{file.name}").touch() - - @classmethod - def _make_splits(cls, root, image_files): - split_folder = root / "lists" - split_folder.mkdir() - random.shuffle(image_files) - splits = ("train", "test") - num_samples_map = {} - for offset, split in enumerate(splits): - image_files_in_split = image_files[offset :: len(splits)] - - split_file = split_folder / f"{split}.txt" - with open(split_file, "w") as file: - file.write( - "\n".join( - sorted( - str(image_file.relative_to(image_file.parents[1]).as_posix()) - for image_file in image_files_in_split - ) - ) - ) - - cls._make_hidden_rouge_file(split_file) - num_samples_map[split] = len(image_files_in_split) - - make_tar(root, split_folder.with_suffix(".tgz").name, compression="gz") - - return num_samples_map - - @classmethod - def _make_anns(cls, root, image_files): - from scipy.io import savemat - - 
anns_folder = root / "annotations-mat" - for image_file in image_files: - ann_file = anns_folder / image_file.with_suffix(".mat").relative_to(image_file.parents[1]) - ann_file.parent.mkdir(parents=True, exist_ok=True) - - savemat( - ann_file, - { - "seg": torch.randint( - 256, make_tensor((2,), low=3, dtype=torch.int).tolist(), dtype=torch.uint8 - ).numpy(), - "bbox": dict( - zip(("left", "top", "right", "bottom"), make_tensor((4,), dtype=torch.uint8).tolist()) - ), - }, - ) - - readme_file = anns_folder / "README.txt" - readme_file.touch() - cls._make_hidden_rouge_file(readme_file) - - make_tar(root, "annotations.tgz", anns_folder, compression="gz") - - @classmethod - def generate(cls, root): - images_folder = root / "images" - image_files = cls._make_images(images_folder) - cls._make_hidden_rouge_file(*image_files) - make_tar(root, images_folder.with_suffix(".tgz").name, compression="gz") - - num_samples_map = cls._make_splits(root, image_files) - cls._make_anns(root, image_files) - - return num_samples_map - - -@register_mock(configs=combinations_grid(split=("train", "test"), year=("2010", "2011"))) -def cub200(root, config): - num_samples_map = (CUB2002011MockData if config["year"] == "2011" else CUB2002010MockData).generate(root) - return num_samples_map[config["split"]] - - -@register_mock(configs=[dict()]) -def eurosat(root, config): - data_folder = root / "2750" - data_folder.mkdir(parents=True) - - num_examples_per_class = 3 - categories = ["AnnualCrop", "Forest"] - for category in categories: - create_image_folder( - root=data_folder, - name=category, - file_name_fn=lambda idx: f"{category}_{idx + 1}.jpg", - num_examples=num_examples_per_class, - ) - make_zip(root, "EuroSAT.zip", data_folder) - return len(categories) * num_examples_per_class - - -@register_mock(configs=combinations_grid(split=("train", "test", "extra"))) -def svhn(root, config): - import scipy.io as sio - - num_samples = { - "train": 2, - "test": 3, - "extra": 4, - }[config["split"]] - - sio.savemat( - root / f"{config['split']}_32x32.mat", - { - "X": np.random.randint(256, size=(32, 32, 3, num_samples), dtype=np.uint8), - "y": np.random.randint(10, size=(num_samples,), dtype=np.uint8), - }, - ) - return num_samples - - -@register_mock(configs=combinations_grid(split=("train", "val", "test"))) -def pcam(root, config): - import h5py - - num_images = {"train": 2, "test": 3, "val": 4}[config["split"]] - - split = "valid" if config["split"] == "val" else config["split"] - - images_io = io.BytesIO() - with h5py.File(images_io, "w") as f: - f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8) - - targets_io = io.BytesIO() - with h5py.File(targets_io, "w") as f: - f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8) - - # Create .gz compressed files - images_file = root / f"camelyonpatch_level_2_split_{split}_x.h5.gz" - targets_file = root / f"camelyonpatch_level_2_split_{split}_y.h5.gz" - for compressed_file_name, uncompressed_file_io in ((images_file, images_io), (targets_file, targets_io)): - compressed_data = gzip.compress(uncompressed_file_io.getbuffer()) - with open(compressed_file_name, "wb") as compressed_file: - compressed_file.write(compressed_data) - - return num_images - - -@register_mock(name="stanford-cars", configs=combinations_grid(split=("train", "test"))) -def stanford_cars(root, config): - import scipy.io as io - from numpy.core.records import fromarrays - - split = config["split"] - num_samples = {"train": 5, "test": 7}[split] - num_categories = 3 - 
- if split == "train": - images_folder_name = "cars_train" - devkit = root / "devkit" - devkit.mkdir() - annotations_mat_path = devkit / "cars_train_annos.mat" - else: - images_folder_name = "cars_test" - annotations_mat_path = root / "cars_test_annos_withlabels.mat" - - create_image_folder( - root=root, - name=images_folder_name, - file_name_fn=lambda image_index: f"{image_index:5d}.jpg", - num_examples=num_samples, - ) - - make_tar(root, f"cars_{split}.tgz", images_folder_name) - bbox = np.random.randint(1, 200, num_samples, dtype=np.uint8) - classes = np.random.randint(1, num_categories + 1, num_samples, dtype=np.uint8) - fnames = [f"{i:5d}.jpg" for i in range(num_samples)] - rec_array = fromarrays( - [bbox, bbox, bbox, bbox, classes, fnames], - names=["bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", "class", "fname"], - ) - - io.savemat(annotations_mat_path, {"annotations": rec_array}) - if split == "train": - make_tar(root, "car_devkit.tgz", devkit, compression="gz") - - return num_samples - - -@register_mock(configs=combinations_grid(split=("train", "test"))) -def usps(root, config): - num_samples = {"train": 15, "test": 7}[config["split"]] - - with bz2.open(root / f"usps{'.t' if not config['split'] == 'train' else ''}.bz2", "wb") as fh: - lines = [] - for _ in range(num_samples): - label = make_tensor(1, low=1, high=11, dtype=torch.int) - values = make_tensor(256, low=-1, high=1, dtype=torch.float) - lines.append( - " ".join([f"{int(label)}", *(f"{idx}:{float(value):.6f}" for idx, value in enumerate(values, 1))]) - ) - - fh.write("\n".join(lines).encode()) - - return num_samples diff --git a/test/expect/ModelTester.test_crestereo_base_expect.pkl b/test/expect/ModelTester.test_crestereo_base_expect.pkl deleted file mode 100644 index e5b8cd8f666..00000000000 Binary files a/test/expect/ModelTester.test_crestereo_base_expect.pkl and /dev/null differ diff --git a/test/expect/ModelTester.test_raft_stereo_base_expect.pkl b/test/expect/ModelTester.test_raft_stereo_base_expect.pkl deleted file mode 100644 index 550d8ed5486..00000000000 Binary files a/test/expect/ModelTester.test_raft_stereo_base_expect.pkl and /dev/null differ diff --git a/test/expect/ModelTester.test_raft_stereo_realtime_expect.pkl b/test/expect/ModelTester.test_raft_stereo_realtime_expect.pkl deleted file mode 100644 index ea4ec1da706..00000000000 Binary files a/test/expect/ModelTester.test_raft_stereo_realtime_expect.pkl and /dev/null differ diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py deleted file mode 100644 index e9192f44f52..00000000000 --- a/test/prototype_common_utils.py +++ /dev/null @@ -1,529 +0,0 @@ -"""This module is separated from common_utils.py to prevent the former to be dependent on torchvision.prototype""" - -import collections.abc -import dataclasses -import functools -from typing import Callable, Optional, Sequence, Tuple, Union - -import PIL.Image -import pytest -import torch -import torch.testing -from datasets_utils import combinations_grid -from torch.nn.functional import one_hot -from torch.testing._comparison import ( - assert_equal as _assert_equal, - BooleanPair, - ErrorMeta, - NonePair, - NumberPair, - TensorLikePair, - UnsupportedInputs, -) -from torchvision.prototype import features -from torchvision.prototype.transforms.functional import convert_image_dtype, to_image_tensor -from torchvision.transforms.functional_tensor import _max_value as get_max_value - -__all__ = [ - "assert_close", - "assert_equal", - "ArgsKwargs", - "make_image_loaders", - "make_image", - 
"make_images", - "make_bounding_box_loaders", - "make_bounding_box", - "make_bounding_boxes", - "make_label", - "make_one_hot_labels", - "make_detection_mask_loaders", - "make_detection_mask", - "make_detection_masks", - "make_segmentation_mask_loaders", - "make_segmentation_mask", - "make_segmentation_masks", - "make_mask_loaders", - "make_masks", -] - - -class PILImagePair(TensorLikePair): - def __init__( - self, - actual, - expected, - *, - agg_method=None, - allowed_percentage_diff=None, - **other_parameters, - ): - if not any(isinstance(input, PIL.Image.Image) for input in (actual, expected)): - raise UnsupportedInputs() - - # This parameter is ignored to enable checking PIL images to tensor images no on the CPU - other_parameters["check_device"] = False - - super().__init__(actual, expected, **other_parameters) - self.agg_method = getattr(torch, agg_method) if isinstance(agg_method, str) else agg_method - self.allowed_percentage_diff = allowed_percentage_diff - - def _process_inputs(self, actual, expected, *, id, allow_subclasses): - actual, expected = [ - to_image_tensor(input) if not isinstance(input, torch.Tensor) else features.Image(input) - for input in [actual, expected] - ] - # This broadcast is needed, because `features.Mask`'s can have a 2D shape, but converting the equivalent PIL - # image to a tensor adds a singleton leading dimension. - # Although it looks like this belongs in `self._equalize_attributes`, it has to happen here. - # `self._equalize_attributes` is called after `super()._compare_attributes` and that has an unconditional - # shape check that will fail if we don't broadcast before. - try: - actual, expected = torch.broadcast_tensors(actual, expected) - except RuntimeError: - raise ErrorMeta( - AssertionError, - f"The image shapes are not broadcastable: {actual.shape} != {expected.shape}.", - id=id, - ) from None - return super()._process_inputs(actual, expected, id=id, allow_subclasses=allow_subclasses) - - def _equalize_attributes(self, actual, expected): - if actual.dtype != expected.dtype: - dtype = torch.promote_types(actual.dtype, expected.dtype) - actual = convert_image_dtype(actual, dtype) - expected = convert_image_dtype(expected, dtype) - - return super()._equalize_attributes(actual, expected) - - def compare(self) -> None: - actual, expected = self.actual, self.expected - - self._compare_attributes(actual, expected) - - actual, expected = self._equalize_attributes(actual, expected) - abs_diff = torch.abs(actual - expected) - - if self.allowed_percentage_diff is not None: - percentage_diff = (abs_diff != 0).to(torch.float).mean() - if percentage_diff > self.allowed_percentage_diff: - self._make_error_meta(AssertionError, "percentage mismatch") - - if self.agg_method is None: - super()._compare_values(actual, expected) - else: - err = self.agg_method(abs_diff.to(torch.float64)) - if err > self.atol: - self._make_error_meta(AssertionError, "aggregated mismatch") - - -def assert_close( - actual, - expected, - *, - allow_subclasses=True, - rtol=None, - atol=None, - equal_nan=False, - check_device=True, - check_dtype=True, - check_layout=True, - check_stride=False, - msg=None, - **kwargs, -): - """Superset of :func:`torch.testing.assert_close` with support for PIL vs. 
tensor image comparison""" - __tracebackhide__ = True - - _assert_equal( - actual, - expected, - pair_types=( - NonePair, - BooleanPair, - NumberPair, - PILImagePair, - TensorLikePair, - ), - allow_subclasses=allow_subclasses, - rtol=rtol, - atol=atol, - equal_nan=equal_nan, - check_device=check_device, - check_dtype=check_dtype, - check_layout=check_layout, - check_stride=check_stride, - msg=msg, - **kwargs, - ) - - -assert_equal = functools.partial(assert_close, rtol=0, atol=0) - - -class ArgsKwargs: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - def __iter__(self): - yield self.args - yield self.kwargs - - def load(self, device="cpu"): - args = tuple(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args) - kwargs = { - keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg for keyword, arg in self.kwargs.items() - } - return args, kwargs - - -DEFAULT_SQUARE_IMAGE_SIZE = 15 -DEFAULT_LANDSCAPE_IMAGE_SIZE = (7, 33) -DEFAULT_PORTRAIT_IMAGE_SIZE = (31, 9) -DEFAULT_IMAGE_SIZES = (DEFAULT_LANDSCAPE_IMAGE_SIZE, DEFAULT_PORTRAIT_IMAGE_SIZE, DEFAULT_SQUARE_IMAGE_SIZE, "random") - - -def _parse_image_size(size, *, name="size"): - if size == "random": - return tuple(torch.randint(15, 33, (2,)).tolist()) - elif isinstance(size, int) and size > 0: - return (size, size) - elif ( - isinstance(size, collections.abc.Sequence) - and len(size) == 2 - and all(isinstance(length, int) and length > 0 for length in size) - ): - return tuple(size) - else: - raise pytest.UsageError( - f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers," - f"but got {size} instead." - ) - - -DEFAULT_EXTRA_DIMS = ((), (0,), (4,), (2, 3), (5, 0), (0, 5)) - - -def from_loader(loader_fn): - def wrapper(*args, **kwargs): - loader = loader_fn(*args, **kwargs) - return loader.load(kwargs.get("device", "cpu")) - - return wrapper - - -def from_loaders(loaders_fn): - def wrapper(*args, **kwargs): - loaders = loaders_fn(*args, **kwargs) - for loader in loaders: - yield loader.load(kwargs.get("device", "cpu")) - - return wrapper - - -@dataclasses.dataclass -class TensorLoader: - fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor] - shape: Sequence[int] - dtype: torch.dtype - - def load(self, device): - return self.fn(self.shape, self.dtype, device) - - -@dataclasses.dataclass -class ImageLoader(TensorLoader): - color_space: features.ColorSpace - image_size: Tuple[int, int] = dataclasses.field(init=False) - num_channels: int = dataclasses.field(init=False) - - def __post_init__(self): - self.image_size = self.shape[-2:] - self.num_channels = self.shape[-3] - - -def make_image_loader( - size="random", - *, - color_space=features.ColorSpace.RGB, - extra_dims=(), - dtype=torch.float32, - constant_alpha=True, -): - size = _parse_image_size(size) - - try: - num_channels = { - features.ColorSpace.GRAY: 1, - features.ColorSpace.GRAY_ALPHA: 2, - features.ColorSpace.RGB: 3, - features.ColorSpace.RGB_ALPHA: 4, - }[color_space] - except KeyError as error: - raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") from error - - def fn(shape, dtype, device): - max_value = get_max_value(dtype) - data = torch.testing.make_tensor(shape, low=0, high=max_value, dtype=dtype, device=device) - if color_space in {features.ColorSpace.GRAY_ALPHA, features.ColorSpace.RGB_ALPHA} and constant_alpha: - data[..., -1, :, :] = max_value - return features.Image(data, 
color_space=color_space) - - return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, color_space=color_space) - - -make_image = from_loader(make_image_loader) - - -def make_image_loaders( - *, - sizes=DEFAULT_IMAGE_SIZES, - color_spaces=( - features.ColorSpace.GRAY, - features.ColorSpace.GRAY_ALPHA, - features.ColorSpace.RGB, - features.ColorSpace.RGB_ALPHA, - ), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.float32, torch.uint8), - constant_alpha=True, -): - for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): - yield make_image_loader(**params, constant_alpha=constant_alpha) - - -make_images = from_loaders(make_image_loaders) - - -@dataclasses.dataclass -class BoundingBoxLoader(TensorLoader): - format: features.BoundingBoxFormat - image_size: Tuple[int, int] - - -def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): - low, high = torch.broadcast_tensors( - *[torch.as_tensor(arg) for arg in ((0, arg1) if arg2 is None else (arg1, arg2))] - ) - return torch.stack( - [ - torch.randint(low_scalar, high_scalar, (), **kwargs) - for low_scalar, high_scalar in zip(low.flatten().tolist(), high.flatten().tolist()) - ] - ).reshape(low.shape) - - -def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtype=torch.float32): - if isinstance(format, str): - format = features.BoundingBoxFormat[format] - if format not in { - features.BoundingBoxFormat.XYXY, - features.BoundingBoxFormat.XYWH, - features.BoundingBoxFormat.CXCYWH, - }: - raise pytest.UsageError(f"Can't make bounding box in format {format}") - - image_size = _parse_image_size(image_size, name="image_size") - - def fn(shape, dtype, device): - *extra_dims, num_coordinates = shape - if num_coordinates != 4: - raise pytest.UsageError() - - if any(dim == 0 for dim in extra_dims): - return features.BoundingBox( - torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, image_size=image_size - ) - - height, width = image_size - - if format == features.BoundingBoxFormat.XYXY: - x1 = torch.randint(0, width // 2, extra_dims) - y1 = torch.randint(0, height // 2, extra_dims) - x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1 - y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1 - parts = (x1, y1, x2, y2) - elif format == features.BoundingBoxFormat.XYWH: - x = torch.randint(0, width // 2, extra_dims) - y = torch.randint(0, height // 2, extra_dims) - w = randint_with_tensor_bounds(1, width - x) - h = randint_with_tensor_bounds(1, height - y) - parts = (x, y, w, h) - else: # format == features.BoundingBoxFormat.CXCYWH: - cx = torch.randint(1, width - 1, ()) - cy = torch.randint(1, height - 1, ()) - w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1) - h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1) - parts = (cx, cy, w, h) - - return features.BoundingBox( - torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, image_size=image_size - ) - - return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, image_size=image_size) - - -make_bounding_box = from_loader(make_bounding_box_loader) - - -def make_bounding_box_loaders( - *, - extra_dims=DEFAULT_EXTRA_DIMS, - formats=tuple(features.BoundingBoxFormat), - image_size="random", - dtypes=(torch.float32, torch.int64), -): - for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, image_size=image_size) - - -make_bounding_boxes = 
from_loaders(make_bounding_box_loaders) - - -@dataclasses.dataclass -class LabelLoader(TensorLoader): - categories: Optional[Sequence[str]] - - -def _parse_categories(categories): - if categories is None: - num_categories = int(torch.randint(1, 11, ())) - elif isinstance(categories, int): - num_categories = categories - categories = [f"category{idx}" for idx in range(num_categories)] - elif isinstance(categories, collections.abc.Sequence) and all(isinstance(category, str) for category in categories): - categories = list(categories) - num_categories = len(categories) - else: - raise pytest.UsageError( - f"`categories` can either be `None` (default), an integer, or a sequence of strings, " - f"but got '{categories}' instead." - ) - return categories, num_categories - - -def make_label_loader(*, extra_dims=(), categories=None, dtype=torch.int64): - categories, num_categories = _parse_categories(categories) - - def fn(shape, dtype, device): - # The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values, - # regardless of the requested dtype, e.g. 0 or 0.0 rather than 0 or 0.123 - data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype) - return features.Label(data, categories=categories) - - return LabelLoader(fn, shape=extra_dims, dtype=dtype, categories=categories) - - -make_label = from_loader(make_label_loader) - - -@dataclasses.dataclass -class OneHotLabelLoader(TensorLoader): - categories: Optional[Sequence[str]] - - -def make_one_hot_label_loader(*, categories=None, extra_dims=(), dtype=torch.int64): - categories, num_categories = _parse_categories(categories) - - def fn(shape, dtype, device): - if num_categories == 0: - data = torch.empty(shape, dtype=dtype, device=device) - else: - # The idiom `make_label_loader(..., dtype=torch.int64); ...; one_hot(...).to(dtype)` is intentional - # since `one_hot` only supports int64 - label = make_label_loader(extra_dims=extra_dims, categories=num_categories, dtype=torch.int64).load(device) - data = one_hot(label, num_classes=num_categories).to(dtype) - return features.OneHotLabel(data, categories=categories) - - return OneHotLabelLoader(fn, shape=(*extra_dims, num_categories), dtype=dtype, categories=categories) - - -def make_one_hot_label_loaders( - *, - categories=(1, 0, None), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.int64, torch.float32), -): - for params in combinations_grid(categories=categories, extra_dims=extra_dims, dtype=dtypes): - yield make_one_hot_label_loader(**params) - - -make_one_hot_labels = from_loaders(make_one_hot_label_loaders) - - -class MaskLoader(TensorLoader): - pass - - -def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8): - # This produces "detection" masks, i.e. 
`(*, N, H, W)`, where `N` denotes the number of objects - size = _parse_image_size(size) - num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects - - def fn(shape, dtype, device): - data = torch.testing.make_tensor(shape, low=0, high=2, dtype=dtype, device=device) - return features.Mask(data) - - return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) - - -make_detection_mask = from_loader(make_detection_mask_loader) - - -def make_detection_mask_loaders( - sizes=DEFAULT_IMAGE_SIZES, - num_objects=(1, 0, "random"), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes): - yield make_detection_mask_loader(**params) - - -make_detection_masks = from_loaders(make_detection_mask_loaders) - - -def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8): - # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values - size = _parse_image_size(size) - num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories - - def fn(shape, dtype, device): - data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=dtype, device=device) - return features.Mask(data) - - return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) - - -make_segmentation_mask = from_loader(make_segmentation_mask_loader) - - -def make_segmentation_mask_loaders( - *, - sizes=DEFAULT_IMAGE_SIZES, - num_categories=(1, 2, "random"), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes): - yield make_segmentation_mask_loader(**params) - - -make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) - - -def make_mask_loaders( - *, - sizes=DEFAULT_IMAGE_SIZES, - num_objects=(1, 0, "random"), - num_categories=(1, 2, "random"), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes) - yield from make_segmentation_mask_loaders( - sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes - ) - - -make_masks = from_loaders(make_mask_loaders) diff --git a/test/prototype_transforms_dispatcher_infos.py b/test/prototype_transforms_dispatcher_infos.py deleted file mode 100644 index 99a9066be0a..00000000000 --- a/test/prototype_transforms_dispatcher_infos.py +++ /dev/null @@ -1,259 +0,0 @@ -import dataclasses -from collections import defaultdict -from typing import Callable, Dict, List, Sequence, Type - -import pytest -import torchvision.prototype.transforms.functional as F -from prototype_transforms_kernel_infos import KERNEL_INFOS, Skip -from torchvision.prototype import features - -__all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] - -KERNEL_SAMPLE_INPUTS_FN_MAP = {info.kernel: info.sample_inputs_fn for info in KERNEL_INFOS} - - -def skip_python_scalar_arg_jit(name, *, reason="Python scalar int or float is not supported when scripting"): - return Skip( - "test_scripted_smoke", - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs[name], (int, float)), - reason=reason, - ) - - -def skip_integer_size_jit(name="size"): - return skip_python_scalar_arg_jit(name, reason="Integer size is not supported when scripting.") - - -@dataclasses.dataclass -class 
DispatcherInfo: - dispatcher: Callable - kernels: Dict[Type, Callable] - skips: Sequence[Skip] = dataclasses.field(default_factory=list) - _skips_map: Dict[str, List[Skip]] = dataclasses.field(default=None, init=False) - - def __post_init__(self): - skips_map = defaultdict(list) - for skip in self.skips: - skips_map[skip.test_name].append(skip) - self._skips_map = dict(skips_map) - - def sample_inputs(self, *types): - for type in types or self.kernels.keys(): - if type not in self.kernels: - raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") - - yield from KERNEL_SAMPLE_INPUTS_FN_MAP[self.kernels[type]]() - - def maybe_skip(self, *, test_name, args_kwargs, device): - skips = self._skips_map.get(test_name) - if not skips: - return - - for skip in skips: - if skip.condition(args_kwargs, device): - pytest.skip(skip.reason) - - -DISPATCHER_INFOS = [ - DispatcherInfo( - F.horizontal_flip, - kernels={ - features.Image: F.horizontal_flip_image_tensor, - features.BoundingBox: F.horizontal_flip_bounding_box, - features.Mask: F.horizontal_flip_mask, - }, - ), - DispatcherInfo( - F.resize, - kernels={ - features.Image: F.resize_image_tensor, - features.BoundingBox: F.resize_bounding_box, - features.Mask: F.resize_mask, - }, - skips=[ - skip_integer_size_jit(), - ], - ), - DispatcherInfo( - F.affine, - kernels={ - features.Image: F.affine_image_tensor, - features.BoundingBox: F.affine_bounding_box, - features.Mask: F.affine_mask, - }, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], - ), - DispatcherInfo( - F.vertical_flip, - kernels={ - features.Image: F.vertical_flip_image_tensor, - features.BoundingBox: F.vertical_flip_bounding_box, - features.Mask: F.vertical_flip_mask, - }, - ), - DispatcherInfo( - F.rotate, - kernels={ - features.Image: F.rotate_image_tensor, - features.BoundingBox: F.rotate_bounding_box, - features.Mask: F.rotate_mask, - }, - ), - DispatcherInfo( - F.crop, - kernels={ - features.Image: F.crop_image_tensor, - features.BoundingBox: F.crop_bounding_box, - features.Mask: F.crop_mask, - }, - ), - DispatcherInfo( - F.resized_crop, - kernels={ - features.Image: F.resized_crop_image_tensor, - features.BoundingBox: F.resized_crop_bounding_box, - features.Mask: F.resized_crop_mask, - }, - ), - DispatcherInfo( - F.pad, - kernels={ - features.Image: F.pad_image_tensor, - features.BoundingBox: F.pad_bounding_box, - features.Mask: F.pad_mask, - }, - ), - DispatcherInfo( - F.perspective, - kernels={ - features.Image: F.perspective_image_tensor, - features.BoundingBox: F.perspective_bounding_box, - features.Mask: F.perspective_mask, - }, - ), - DispatcherInfo( - F.elastic, - kernels={ - features.Image: F.elastic_image_tensor, - features.BoundingBox: F.elastic_bounding_box, - features.Mask: F.elastic_mask, - }, - ), - DispatcherInfo( - F.center_crop, - kernels={ - features.Image: F.center_crop_image_tensor, - features.BoundingBox: F.center_crop_bounding_box, - features.Mask: F.center_crop_mask, - }, - skips=[ - skip_integer_size_jit("output_size"), - ], - ), - DispatcherInfo( - F.gaussian_blur, - kernels={ - features.Image: F.gaussian_blur_image_tensor, - }, - skips=[ - skip_python_scalar_arg_jit("kernel_size"), - skip_python_scalar_arg_jit("sigma"), - ], - ), - DispatcherInfo( - F.equalize, - kernels={ - features.Image: F.equalize_image_tensor, - }, - ), - DispatcherInfo( - F.invert, - kernels={ - features.Image: F.invert_image_tensor, - }, - ), - DispatcherInfo( - F.posterize, - kernels={ - features.Image: 
F.posterize_image_tensor, - }, - ), - DispatcherInfo( - F.solarize, - kernels={ - features.Image: F.solarize_image_tensor, - }, - ), - DispatcherInfo( - F.autocontrast, - kernels={ - features.Image: F.autocontrast_image_tensor, - }, - ), - DispatcherInfo( - F.adjust_sharpness, - kernels={ - features.Image: F.adjust_sharpness_image_tensor, - }, - ), - DispatcherInfo( - F.erase, - kernels={ - features.Image: F.erase_image_tensor, - }, - ), - DispatcherInfo( - F.adjust_brightness, - kernels={ - features.Image: F.adjust_brightness_image_tensor, - }, - ), - DispatcherInfo( - F.adjust_contrast, - kernels={ - features.Image: F.adjust_contrast_image_tensor, - }, - ), - DispatcherInfo( - F.adjust_gamma, - kernels={ - features.Image: F.adjust_gamma_image_tensor, - }, - ), - DispatcherInfo( - F.adjust_hue, - kernels={ - features.Image: F.adjust_hue_image_tensor, - }, - ), - DispatcherInfo( - F.adjust_saturation, - kernels={ - features.Image: F.adjust_saturation_image_tensor, - }, - ), - DispatcherInfo( - F.five_crop, - kernels={ - features.Image: F.five_crop_image_tensor, - }, - skips=[ - skip_integer_size_jit(), - ], - ), - DispatcherInfo( - F.ten_crop, - kernels={ - features.Image: F.ten_crop_image_tensor, - }, - skips=[ - skip_integer_size_jit(), - ], - ), - DispatcherInfo( - F.normalize, - kernels={ - features.Image: F.normalize_image_tensor, - }, - ), -] diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py deleted file mode 100644 index 3f050ad8f7d..00000000000 --- a/test/prototype_transforms_kernel_infos.py +++ /dev/null @@ -1,1594 +0,0 @@ -import dataclasses -import functools -import itertools -import math -from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence - -import numpy as np -import pytest -import torch.testing -import torchvision.ops -import torchvision.prototype.transforms.functional as F -from datasets_utils import combinations_grid -from prototype_common_utils import ArgsKwargs, make_bounding_box_loaders, make_image_loaders, make_mask_loaders -from torchvision.prototype import features -from torchvision.transforms.functional_tensor import _max_value as get_max_value - -__all__ = ["KernelInfo", "KERNEL_INFOS"] - - -@dataclasses.dataclass -class Skip: - test_name: str - reason: str - condition: Callable[[ArgsKwargs, str], bool] = lambda args_kwargs, device: True - - -@dataclasses.dataclass -class KernelInfo: - kernel: Callable - # Most common tests use these inputs to check the kernel. As such it should cover all valid code paths, but should - # not include extensive parameter combinations to keep to overall test count moderate. - sample_inputs_fn: Callable[[], Iterable[ArgsKwargs]] - # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name - # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then - kernel_name: Optional[str] = None - # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also take - # tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should happen - # inside the function. It should return a tensor or to be more precise an object that can be compared to a - # tensor by `assert_close`. If omitted, no reference test will be performed. 
- reference_fn: Optional[Callable] = None - # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter - # values to be tested. If not specified, `sample_inputs_fn` will be used. - reference_inputs_fn: Optional[Callable[[], Iterable[ArgsKwargs]]] = None - # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. - closeness_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) - skips: Sequence[Skip] = dataclasses.field(default_factory=list) - _skips_map: Dict[str, List[Skip]] = dataclasses.field(default=None, init=False) - - def __post_init__(self): - self.kernel_name = self.kernel_name or self.kernel.__name__ - self.reference_inputs_fn = self.reference_inputs_fn or self.sample_inputs_fn - - skips_map = defaultdict(list) - for skip in self.skips: - skips_map[skip.test_name].append(skip) - self._skips_map = dict(skips_map) - - def maybe_skip(self, *, test_name, args_kwargs, device): - skips = self._skips_map.get(test_name) - if not skips: - return - - for skip in skips: - if skip.condition(args_kwargs, device): - pytest.skip(skip.reason) - - -DEFAULT_IMAGE_CLOSENESS_KWARGS = dict( - atol=1e-5, - rtol=0, - agg_method="mean", -) - - -def pil_reference_wrapper(pil_kernel): - @functools.wraps(pil_kernel) - def wrapper(image_tensor, *other_args, **kwargs): - if image_tensor.ndim > 3: - raise pytest.UsageError( - f"Can only test single tensor images against PIL, but input has shape {image_tensor.shape}" - ) - - # We don't need to convert back to tensor here, since `assert_close` does that automatically. - return pil_kernel(F.to_image_pil(image_tensor), *other_args, **kwargs) - - return wrapper - - -def skip_python_scalar_arg_jit(name, *, reason="Python scalar int or float is not supported when scripting"): - return Skip( - "test_scripted_vs_eager", - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs[name], (int, float)), - reason=reason, - ) - - -def skip_integer_size_jit(name="size"): - return skip_python_scalar_arg_jit(name, reason="Integer size is not supported when scripting.") - - -KERNEL_INFOS = [] - - -def sample_inputs_horizontal_flip_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], dtypes=[torch.float32]): - yield ArgsKwargs(image_loader) - - -def reference_inputs_horizontal_flip_image_tensor(): - for image_loader in make_image_loaders(extra_dims=[()]): - yield ArgsKwargs(image_loader) - - -def sample_inputs_horizontal_flip_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders( - formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] - ): - yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size - ) - - -def sample_inputs_horizontal_flip_mask(): - for image_loader in make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]): - yield ArgsKwargs(image_loader) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.horizontal_flip_image_tensor, - kernel_name="horizontal_flip_image_tensor", - sample_inputs_fn=sample_inputs_horizontal_flip_image_tensor, - reference_fn=pil_reference_wrapper(F.horizontal_flip_image_pil), - reference_inputs_fn=reference_inputs_horizontal_flip_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.horizontal_flip_bounding_box, - sample_inputs_fn=sample_inputs_horizontal_flip_bounding_box, - ), - KernelInfo( - F.horizontal_flip_mask, - sample_inputs_fn=sample_inputs_horizontal_flip_mask, - ), - ] -) - - -def 
_get_resize_sizes(image_size): - height, width = image_size - length = max(image_size) - # FIXME: enable me when the kernels are fixed - # yield length - yield [length] - yield (length,) - new_height = int(height * 0.75) - new_width = int(width * 1.25) - yield [new_height, new_width] - yield height, width - - -def sample_inputs_resize_image_tensor(): - for image_loader, interpolation in itertools.product( - make_image_loaders(dtypes=[torch.float32]), - [ - F.InterpolationMode.NEAREST, - F.InterpolationMode.BICUBIC, - ], - ): - for size in _get_resize_sizes(image_loader.image_size): - yield ArgsKwargs(image_loader, size=size, interpolation=interpolation) - - -@pil_reference_wrapper -def reference_resize_image_tensor(*args, **kwargs): - if not kwargs.pop("antialias", False) and kwargs.get("interpolation", F.InterpolationMode.BILINEAR) in { - F.InterpolationMode.BILINEAR, - F.InterpolationMode.BICUBIC, - }: - raise pytest.UsageError("Anti-aliasing is always active in PIL") - return F.resize_image_pil(*args, **kwargs) - - -def reference_inputs_resize_image_tensor(): - for image_loader, interpolation in itertools.product( - make_image_loaders(extra_dims=[()]), - [ - F.InterpolationMode.NEAREST, - F.InterpolationMode.BILINEAR, - F.InterpolationMode.BICUBIC, - ], - ): - for size in _get_resize_sizes(image_loader.image_size): - yield ArgsKwargs( - image_loader, - size=size, - interpolation=interpolation, - antialias=interpolation - in { - F.InterpolationMode.BILINEAR, - F.InterpolationMode.BICUBIC, - }, - ) - - -def sample_inputs_resize_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(formats=[features.BoundingBoxFormat.XYXY]): - for size in _get_resize_sizes(bounding_box_loader.image_size): - yield ArgsKwargs(bounding_box_loader, size=size, image_size=bounding_box_loader.image_size) - - -def sample_inputs_resize_mask(): - for mask_loader in make_mask_loaders(dtypes=[torch.uint8]): - for size in _get_resize_sizes(mask_loader.shape[-2:]): - yield ArgsKwargs(mask_loader, size=size) - - -@pil_reference_wrapper -def reference_resize_mask(*args, **kwargs): - return F.resize_image_pil(*args, interpolation=F.InterpolationMode.NEAREST, **kwargs) - - -def reference_inputs_resize_mask(): - for mask_loader in make_mask_loaders(extra_dims=[()], num_objects=[1]): - for size in _get_resize_sizes(mask_loader.shape[-2:]): - yield ArgsKwargs(mask_loader, size=size) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.resize_image_tensor, - sample_inputs_fn=sample_inputs_resize_image_tensor, - reference_fn=reference_resize_image_tensor, - reference_inputs_fn=reference_inputs_resize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit(), - ], - ), - KernelInfo( - F.resize_bounding_box, - sample_inputs_fn=sample_inputs_resize_bounding_box, - skips=[ - skip_integer_size_jit(), - ], - ), - KernelInfo( - F.resize_mask, - sample_inputs_fn=sample_inputs_resize_mask, - reference_fn=reference_resize_mask, - reference_inputs_fn=reference_inputs_resize_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit(), - ], - ), - ] -) - - -_AFFINE_KWARGS = combinations_grid( - angle=[-87, 15, 90], - translate=[(5, 5), (-5, -5)], - scale=[0.77, 1.27], - shear=[(12, 12), (0, 0)], -) - - -def _diversify_affine_kwargs_types(affine_kwargs): - angle = affine_kwargs["angle"] - for diverse_angle in [int(angle), float(angle)]: - yield dict(affine_kwargs, angle=diverse_angle) - - shear = affine_kwargs["shear"] - for diverse_shear in [tuple(shear), 
list(shear), int(shear[0]), float(shear[0])]: - yield dict(affine_kwargs, shear=diverse_shear) - - -def sample_inputs_affine_image_tensor(): - for image_loader, interpolation_mode, center in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), - [ - F.InterpolationMode.NEAREST, - F.InterpolationMode.BILINEAR, - ], - [None, (0, 0)], - ): - for fill in [None, 128.0, 128, [12.0], [0.5] * image_loader.num_channels]: - yield ArgsKwargs( - image_loader, - interpolation=interpolation_mode, - center=center, - fill=fill, - **_AFFINE_KWARGS[0], - ) - - for image_loader, affine_kwargs in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) - ): - yield ArgsKwargs(image_loader, **affine_kwargs) - - -def reference_inputs_affine_image_tensor(): - for image_loader, affine_kwargs in itertools.product(make_image_loaders(extra_dims=[()]), _AFFINE_KWARGS): - yield ArgsKwargs( - image_loader, - interpolation=F.InterpolationMode.NEAREST, - **affine_kwargs, - ) - - -def sample_inputs_affine_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): - yield ArgsKwargs( - bounding_box_loader, - format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, - **_AFFINE_KWARGS[0], - ) - - for bounding_box_loader, affine_kwargs in itertools.product( - make_bounding_box_loaders(), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) - ): - yield ArgsKwargs( - bounding_box_loader, - format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, - **affine_kwargs, - ) - - -def _compute_affine_matrix(angle, translate, scale, shear, center): - rot = math.radians(angle) - cx, cy = center - tx, ty = translate - sx, sy = [math.radians(sh_) for sh_ in shear] - - c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) - t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) - c_matrix_inv = np.linalg.inv(c_matrix) - rs_matrix = np.array( - [ - [scale * math.cos(rot), -scale * math.sin(rot), 0], - [scale * math.sin(rot), scale * math.cos(rot), 0], - [0, 0, 1], - ] - ) - shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) - shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) - rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) - true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) - return true_matrix - - -def reference_affine_bounding_box(bounding_box, *, format, image_size, angle, translate, scale, shear, center=None): - if center is None: - center = [s * 0.5 for s in image_size[::-1]] - - def transform(bbox): - affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center) - affine_matrix = affine_matrix[:2, :] - - bbox_xyxy = F.convert_format_bounding_box(bbox, old_format=format, new_format=features.BoundingBoxFormat.XYXY) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - ] - ) - transformed_points = np.matmul(points, affine_matrix.T) - out_bbox = torch.tensor( - [ - np.min(transformed_points[:, 0]), - np.min(transformed_points[:, 1]), - np.max(transformed_points[:, 0]), - np.max(transformed_points[:, 1]), - ], - dtype=bbox.dtype, - ) - return F.convert_format_bounding_box( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False - ) - - if 
bounding_box.ndim < 2: - bounding_box = [bounding_box] - - expected_bboxes = [transform(bbox) for bbox in bounding_box] - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - - return expected_bboxes - - -def reference_inputs_affine_bounding_box(): - for bounding_box_loader, affine_kwargs in itertools.product( - make_bounding_box_loaders(extra_dims=[()]), - _AFFINE_KWARGS, - ): - yield ArgsKwargs( - bounding_box_loader, - format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, - **affine_kwargs, - ) - - -def sample_inputs_affine_image_mask(): - for mask_loader, center in itertools.product( - make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]), - [None, (0, 0)], - ): - yield ArgsKwargs(mask_loader, center=center, **_AFFINE_KWARGS[0]) - - for mask_loader, affine_kwargs in itertools.product( - make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]), _diversify_affine_kwargs_types(_AFFINE_KWARGS[0]) - ): - yield ArgsKwargs(mask_loader, **affine_kwargs) - - -@pil_reference_wrapper -def reference_affine_mask(*args, **kwargs): - return F.affine_image_pil(*args, interpolation=F.InterpolationMode.NEAREST, **kwargs) - - -def reference_inputs_resize_mask(): - for mask_loader, affine_kwargs in itertools.product( - make_mask_loaders(extra_dims=[()], num_objects=[1]), _AFFINE_KWARGS - ): - yield ArgsKwargs(mask_loader, **affine_kwargs) - - -# FIXME: @datumbox, remove this as soon as you have fixed the behavior in https://github.com/pytorch/vision/pull/6636 -def skip_scalar_shears(*test_names): - for test_name in test_names: - yield Skip( - test_name, - condition=lambda args_kwargs, device: isinstance(args_kwargs.kwargs["shear"], (int, float)), - reason="The kernel is broken for a scalar `shear`", - ) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.affine_image_tensor, - sample_inputs_fn=sample_inputs_affine_image_tensor, - reference_fn=pil_reference_wrapper(F.affine_image_pil), - reference_inputs_fn=reference_inputs_affine_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], - ), - KernelInfo( - F.affine_bounding_box, - sample_inputs_fn=sample_inputs_affine_bounding_box, - reference_fn=reference_affine_bounding_box, - reference_inputs_fn=reference_inputs_affine_bounding_box, - closeness_kwargs=dict(atol=1, rtol=0), - skips=[ - skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT"), - *skip_scalar_shears( - "test_batched_vs_single", - "test_no_inplace", - "test_dtype_and_device_consistency", - ), - ], - ), - KernelInfo( - F.affine_mask, - sample_inputs_fn=sample_inputs_affine_image_mask, - reference_fn=reference_affine_mask, - reference_inputs_fn=reference_inputs_resize_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[skip_python_scalar_arg_jit("shear", reason="Scalar shear is not supported by JIT")], - ), - ] -) - - -def sample_inputs_convert_format_bounding_box(): - formats = set(features.BoundingBoxFormat) - for bounding_box_loader in make_bounding_box_loaders(formats=formats): - old_format = bounding_box_loader.format - for params in combinations_grid(new_format=formats - {old_format}, copy=(True, False)): - yield ArgsKwargs(bounding_box_loader, old_format=old_format, **params) - - -def reference_convert_format_bounding_box(bounding_box, old_format, new_format, copy): - if not copy: - raise pytest.UsageError("Reference for 
`convert_format_bounding_box` only supports `copy=True`") - - return torchvision.ops.box_convert( - bounding_box, in_fmt=old_format.kernel_name.lower(), out_fmt=new_format.kernel_name.lower() - ) - - -def reference_inputs_convert_format_bounding_box(): - for args_kwargs in sample_inputs_convert_color_space_image_tensor(): - (image_loader, *other_args), kwargs = args_kwargs - if len(image_loader.shape) == 2 and kwargs.setdefault("copy", True): - yield args_kwargs - - -KERNEL_INFOS.append( - KernelInfo( - F.convert_format_bounding_box, - sample_inputs_fn=sample_inputs_convert_format_bounding_box, - reference_fn=reference_convert_format_bounding_box, - reference_inputs_fn=reference_inputs_convert_format_bounding_box, - ), -) - - -def sample_inputs_convert_color_space_image_tensor(): - color_spaces = set(features.ColorSpace) - {features.ColorSpace.OTHER} - for image_loader in make_image_loaders(sizes=["random"], color_spaces=color_spaces, constant_alpha=True): - old_color_space = image_loader.color_space - for params in combinations_grid(new_color_space=color_spaces - {old_color_space}, copy=(True, False)): - yield ArgsKwargs(image_loader, old_color_space=old_color_space, **params) - - -@pil_reference_wrapper -def reference_convert_color_space_image_tensor(image_pil, old_color_space, new_color_space, copy): - color_space_pil = features.ColorSpace.from_pil_mode(image_pil.mode) - if color_space_pil != old_color_space: - raise pytest.UsageError( - f"Converting the tensor image into an PIL image changed the colorspace " - f"from {old_color_space} to {color_space_pil}" - ) - - return F.convert_color_space_image_pil(image_pil, color_space=new_color_space, copy=copy) - - -def reference_inputs_convert_color_space_image_tensor(): - for args_kwargs in sample_inputs_convert_color_space_image_tensor(): - (image_loader, *other_args), kwargs = args_kwargs - if len(image_loader.shape) == 3: - yield args_kwargs - - -KERNEL_INFOS.append( - KernelInfo( - F.convert_color_space_image_tensor, - sample_inputs_fn=sample_inputs_convert_color_space_image_tensor, - reference_fn=reference_convert_color_space_image_tensor, - reference_inputs_fn=reference_inputs_convert_color_space_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), -) - - -def sample_inputs_vertical_flip_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], dtypes=[torch.float32]): - yield ArgsKwargs(image_loader) - - -def reference_inputs_vertical_flip_image_tensor(): - for image_loader in make_image_loaders(extra_dims=[()]): - yield ArgsKwargs(image_loader) - - -def sample_inputs_vertical_flip_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders( - formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] - ): - yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size - ) - - -def sample_inputs_vertical_flip_mask(): - for image_loader in make_mask_loaders(sizes=["random"], dtypes=[torch.uint8]): - yield ArgsKwargs(image_loader) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.vertical_flip_image_tensor, - kernel_name="vertical_flip_image_tensor", - sample_inputs_fn=sample_inputs_vertical_flip_image_tensor, - reference_fn=pil_reference_wrapper(F.vertical_flip_image_pil), - reference_inputs_fn=reference_inputs_vertical_flip_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.vertical_flip_bounding_box, - sample_inputs_fn=sample_inputs_vertical_flip_bounding_box, - ), - KernelInfo( - 
F.vertical_flip_mask, - sample_inputs_fn=sample_inputs_vertical_flip_mask, - ), - ] -) - -_ROTATE_ANGLES = [-87, 15, 90] - - -def sample_inputs_rotate_image_tensor(): - for image_loader, params in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.float32]), - combinations_grid( - interpolation=[F.InterpolationMode.NEAREST, F.InterpolationMode.BILINEAR], - expand=[True, False], - center=[None, (0, 0)], - ), - ): - if params["center"] is not None and params["expand"]: - # Otherwise this will emit a warning and ignore center anyway - continue - - for fill in [None, 0.5, [0.5] * image_loader.num_channels]: - yield ArgsKwargs( - image_loader, - angle=_ROTATE_ANGLES[0], - fill=fill, - **params, - ) - - -def reference_inputs_rotate_image_tensor(): - for image_loader, angle in itertools.product(make_image_loaders(extra_dims=[()]), _ROTATE_ANGLES): - yield ArgsKwargs(image_loader, angle=angle) - - -def sample_inputs_rotate_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): - yield ArgsKwargs( - bounding_box_loader, - format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, - angle=_ROTATE_ANGLES[0], - ) - - -def sample_inputs_rotate_mask(): - for image_loader, params in itertools.product( - make_image_loaders(sizes=["random"], dtypes=[torch.uint8]), - combinations_grid( - expand=[True, False], - center=[None, (0, 0)], - ), - ): - if params["center"] is not None and params["expand"]: - # Otherwise this will emit a warning and ignore center anyway - continue - - yield ArgsKwargs( - image_loader, - angle=_ROTATE_ANGLES[0], - **params, - ) - - -@pil_reference_wrapper -def reference_rotate_mask(*args, **kwargs): - return F.rotate_image_pil(*args, interpolation=F.InterpolationMode.NEAREST, **kwargs) - - -def reference_inputs_rotate_mask(): - for mask_loader, angle in itertools.product(make_mask_loaders(extra_dims=[()], num_objects=[1]), _ROTATE_ANGLES): - yield ArgsKwargs(mask_loader, angle=angle) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.rotate_image_tensor, - sample_inputs_fn=sample_inputs_rotate_image_tensor, - reference_fn=pil_reference_wrapper(F.rotate_image_pil), - reference_inputs_fn=reference_inputs_rotate_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.rotate_bounding_box, - sample_inputs_fn=sample_inputs_rotate_bounding_box, - ), - KernelInfo( - F.rotate_mask, - sample_inputs_fn=sample_inputs_rotate_mask, - reference_fn=reference_rotate_mask, - reference_inputs_fn=reference_inputs_rotate_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - ] -) - -_CROP_PARAMS = combinations_grid(top=[-8, 0, 9], left=[-8, 0, 9], height=[12, 20], width=[12, 20]) - - -def sample_inputs_crop_image_tensor(): - for image_loader, params in itertools.product(make_image_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]): - yield ArgsKwargs(image_loader, **params) - - -def reference_inputs_crop_image_tensor(): - for image_loader, params in itertools.product(make_image_loaders(extra_dims=[()]), _CROP_PARAMS): - yield ArgsKwargs(image_loader, **params) - - -def sample_inputs_crop_bounding_box(): - for bounding_box_loader, params in itertools.product( - make_bounding_box_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]] - ): - yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, **params) - - -def sample_inputs_crop_mask(): - for mask_loader, params in itertools.product(make_mask_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]): - yield ArgsKwargs(mask_loader, **params) - - -def 
reference_inputs_crop_mask(): - for mask_loader, params in itertools.product(make_mask_loaders(extra_dims=[()], num_objects=[1]), _CROP_PARAMS): - yield ArgsKwargs(mask_loader, **params) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.crop_image_tensor, - kernel_name="crop_image_tensor", - sample_inputs_fn=sample_inputs_crop_image_tensor, - reference_fn=pil_reference_wrapper(F.crop_image_pil), - reference_inputs_fn=reference_inputs_crop_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.crop_bounding_box, - sample_inputs_fn=sample_inputs_crop_bounding_box, - ), - KernelInfo( - F.crop_mask, - sample_inputs_fn=sample_inputs_crop_mask, - reference_fn=pil_reference_wrapper(F.crop_image_pil), - reference_inputs_fn=reference_inputs_crop_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - ] -) - -_RESIZED_CROP_PARAMS = combinations_grid(top=[-8, 9], left=[-8, 9], height=[12], width=[12], size=[(16, 18)]) - - -def sample_inputs_resized_crop_image_tensor(): - for image_loader in make_image_loaders(): - yield ArgsKwargs(image_loader, **_RESIZED_CROP_PARAMS[0]) - - -@pil_reference_wrapper -def reference_resized_crop_image_tensor(*args, **kwargs): - if not kwargs.pop("antialias", False) and kwargs.get("interpolation", F.InterpolationMode.BILINEAR) in { - F.InterpolationMode.BILINEAR, - F.InterpolationMode.BICUBIC, - }: - raise pytest.UsageError("Anti-aliasing is always active in PIL") - return F.resized_crop_image_pil(*args, **kwargs) - - -def reference_inputs_resized_crop_image_tensor(): - for image_loader, interpolation, params in itertools.product( - make_image_loaders(extra_dims=[()]), - [ - F.InterpolationMode.NEAREST, - F.InterpolationMode.BILINEAR, - F.InterpolationMode.BICUBIC, - ], - _RESIZED_CROP_PARAMS, - ): - yield ArgsKwargs( - image_loader, - interpolation=interpolation, - antialias=interpolation - in { - F.InterpolationMode.BILINEAR, - F.InterpolationMode.BICUBIC, - }, - **params, - ) - - -def sample_inputs_resized_crop_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): - yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, **_RESIZED_CROP_PARAMS[0]) - - -def sample_inputs_resized_crop_mask(): - for mask_loader in make_mask_loaders(): - yield ArgsKwargs(mask_loader, **_RESIZED_CROP_PARAMS[0]) - - -def reference_inputs_resized_crop_mask(): - for mask_loader, params in itertools.product( - make_mask_loaders(extra_dims=[()], num_objects=[1]), _RESIZED_CROP_PARAMS - ): - yield ArgsKwargs(mask_loader, **params) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.resized_crop_image_tensor, - sample_inputs_fn=sample_inputs_resized_crop_image_tensor, - reference_fn=reference_resized_crop_image_tensor, - reference_inputs_fn=reference_inputs_resized_crop_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.resized_crop_bounding_box, - sample_inputs_fn=sample_inputs_resized_crop_bounding_box, - ), - KernelInfo( - F.resized_crop_mask, - sample_inputs_fn=sample_inputs_resized_crop_mask, - reference_fn=pil_reference_wrapper(F.resized_crop_image_pil), - reference_inputs_fn=reference_inputs_resized_crop_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - ] -) - -_PAD_PARAMS = combinations_grid( - padding=[[1], [1, 1], [1, 1, 2, 2]], - padding_mode=["constant", "symmetric", "edge", "reflect"], -) - - -def sample_inputs_pad_image_tensor(): - for image_loader, params in itertools.product(make_image_loaders(sizes=["random"]), _PAD_PARAMS): - fills = [None, 128.0, 128, 
[12.0]] - if params["padding_mode"] == "constant": - fills.append([12.0 + c for c in range(image_loader.num_channels)]) - for fill in fills: - yield ArgsKwargs(image_loader, fill=fill, **params) - - -def reference_inputs_pad_image_tensor(): - for image_loader, params in itertools.product(make_image_loaders(extra_dims=[()]), _PAD_PARAMS): - # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it? - fills = [None, 128.0, 128] - if params["padding_mode"] == "constant": - fills.append([12.0 + c for c in range(image_loader.num_channels)]) - for fill in fills: - yield ArgsKwargs(image_loader, fill=fill, **params) - - -def sample_inputs_pad_bounding_box(): - for bounding_box_loader, params in itertools.product(make_bounding_box_loaders(), _PAD_PARAMS): - if params["padding_mode"] != "constant": - continue - - yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size, **params - ) - - -def sample_inputs_pad_mask(): - for image_loader, fill, params in itertools.product(make_mask_loaders(sizes=["random"]), [None, 127], _PAD_PARAMS): - yield ArgsKwargs(image_loader, fill=fill, **params) - - -def reference_inputs_pad_mask(): - for image_loader, fill, params in itertools.product(make_image_loaders(extra_dims=[()]), [None, 127], _PAD_PARAMS): - yield ArgsKwargs(image_loader, fill=fill, **params) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.pad_image_tensor, - sample_inputs_fn=sample_inputs_pad_image_tensor, - reference_fn=pil_reference_wrapper(F.pad_image_pil), - reference_inputs_fn=reference_inputs_pad_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.pad_bounding_box, - sample_inputs_fn=sample_inputs_pad_bounding_box, - ), - KernelInfo( - F.pad_mask, - sample_inputs_fn=sample_inputs_pad_mask, - reference_fn=pil_reference_wrapper(F.pad_image_pil), - reference_inputs_fn=reference_inputs_pad_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - ] -) - -_PERSPECTIVE_COEFFS = [ - [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], - [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], -] - - -def sample_inputs_perspective_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): - for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: - yield ArgsKwargs(image_loader, fill=fill, perspective_coeffs=_PERSPECTIVE_COEFFS[0]) - - -def reference_inputs_perspective_image_tensor(): - for image_loader, perspective_coeffs in itertools.product(make_image_loaders(extra_dims=[()]), _PERSPECTIVE_COEFFS): - # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it? 
- for fill in [None, 128.0, 128, [12.0 + c for c in range(image_loader.num_channels)]]: - yield ArgsKwargs(image_loader, fill=fill, perspective_coeffs=perspective_coeffs) - - -def sample_inputs_perspective_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): - yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, perspective_coeffs=_PERSPECTIVE_COEFFS[0] - ) - - -def sample_inputs_perspective_mask(): - for mask_loader in make_mask_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): - yield ArgsKwargs(mask_loader, perspective_coeffs=_PERSPECTIVE_COEFFS[0]) - - -def reference_inputs_perspective_mask(): - for mask_loader, perspective_coeffs in itertools.product( - make_mask_loaders(extra_dims=[()], num_objects=[1]), _PERSPECTIVE_COEFFS - ): - yield ArgsKwargs(mask_loader, perspective_coeffs=perspective_coeffs) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.perspective_image_tensor, - sample_inputs_fn=sample_inputs_perspective_image_tensor, - reference_fn=pil_reference_wrapper(F.perspective_image_pil), - reference_inputs_fn=reference_inputs_perspective_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.perspective_bounding_box, - sample_inputs_fn=sample_inputs_perspective_bounding_box, - ), - KernelInfo( - F.perspective_mask, - sample_inputs_fn=sample_inputs_perspective_mask, - reference_fn=pil_reference_wrapper(F.perspective_image_pil), - reference_inputs_fn=reference_inputs_perspective_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - ] -) - - -def _get_elastic_displacement(image_size): - return torch.rand(1, *image_size, 2) - - -def sample_inputs_elastic_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): - displacement = _get_elastic_displacement(image_loader.image_size) - for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: - yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) - - -def reference_inputs_elastic_image_tensor(): - for image_loader, interpolation in itertools.product( - make_image_loaders(extra_dims=[()]), - [ - F.InterpolationMode.NEAREST, - F.InterpolationMode.BILINEAR, - F.InterpolationMode.BICUBIC, - ], - ): - displacement = _get_elastic_displacement(image_loader.image_size) - for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: - yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) - - -def sample_inputs_elastic_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): - displacement = _get_elastic_displacement(bounding_box_loader.image_size) - yield ArgsKwargs( - bounding_box_loader, - format=bounding_box_loader.format, - displacement=displacement, - ) - - -def sample_inputs_elastic_mask(): - for mask_loader in make_mask_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): - displacement = _get_elastic_displacement(mask_loader.shape[-2:]) - yield ArgsKwargs(mask_loader, displacement=displacement) - - -def reference_inputs_elastic_mask(): - for mask_loader in make_mask_loaders(extra_dims=[()], num_objects=[1]): - displacement = _get_elastic_displacement(mask_loader.shape[-2:]) - yield ArgsKwargs(mask_loader, displacement=displacement) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - 
F.elastic_image_tensor, - sample_inputs_fn=sample_inputs_elastic_image_tensor, - reference_fn=pil_reference_wrapper(F.elastic_image_pil), - reference_inputs_fn=reference_inputs_elastic_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.elastic_bounding_box, - sample_inputs_fn=sample_inputs_elastic_bounding_box, - ), - KernelInfo( - F.elastic_mask, - sample_inputs_fn=sample_inputs_elastic_mask, - reference_fn=pil_reference_wrapper(F.elastic_image_pil), - reference_inputs_fn=reference_inputs_elastic_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - ] -) - - -_CENTER_CROP_IMAGE_SIZES = [(16, 16), (7, 33), (31, 9)] -_CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] - - -def sample_inputs_center_crop_image_tensor(): - for image_loader, output_size in itertools.product( - make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES), _CENTER_CROP_OUTPUT_SIZES - ): - yield ArgsKwargs(image_loader, output_size=output_size) - - -def reference_inputs_center_crop_image_tensor(): - for image_loader, output_size in itertools.product( - make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES - ): - yield ArgsKwargs(image_loader, output_size=output_size) - - -def sample_inputs_center_crop_bounding_box(): - for bounding_box_loader, output_size in itertools.product(make_bounding_box_loaders(), _CENTER_CROP_OUTPUT_SIZES): - yield ArgsKwargs( - bounding_box_loader, - format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, - output_size=output_size, - ) - - -def sample_inputs_center_crop_mask(): - for mask_loader, output_size in itertools.product( - make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES), _CENTER_CROP_OUTPUT_SIZES - ): - yield ArgsKwargs(mask_loader, output_size=output_size) - - -def reference_inputs_center_crop_mask(): - for mask_loader, output_size in itertools.product( - make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES - ): - yield ArgsKwargs(mask_loader, output_size=output_size) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.center_crop_image_tensor, - sample_inputs_fn=sample_inputs_center_crop_image_tensor, - reference_fn=pil_reference_wrapper(F.center_crop_image_pil), - reference_inputs_fn=reference_inputs_center_crop_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit("output_size"), - ], - ), - KernelInfo( - F.center_crop_bounding_box, - sample_inputs_fn=sample_inputs_center_crop_bounding_box, - skips=[ - skip_integer_size_jit("output_size"), - ], - ), - KernelInfo( - F.center_crop_mask, - sample_inputs_fn=sample_inputs_center_crop_mask, - reference_fn=pil_reference_wrapper(F.center_crop_image_pil), - reference_inputs_fn=reference_inputs_center_crop_mask, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - skip_integer_size_jit("output_size"), - ], - ), - ] -) - - -def sample_inputs_gaussian_blur_image_tensor(): - for image_loader, params in itertools.product( - make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ), - combinations_grid( - kernel_size=[(3, 3), [3, 3], 5], - sigma=[None, (3.0, 3.0), [2.0, 2.0], 4.0, [1.5], (3.14,)], - ), - ): - yield ArgsKwargs(image_loader, **params) - - -KERNEL_INFOS.append( - KernelInfo( - F.gaussian_blur_image_tensor, - sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - skips=[ - 
skip_python_scalar_arg_jit("kernel_size"), - skip_python_scalar_arg_jit("sigma"), - ], - ) -) - - -def sample_inputs_equalize_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), - dtypes=[torch.uint8], - ): - yield ArgsKwargs(image_loader) - - -def reference_inputs_equalize_image_tensor(): - for image_loader in make_image_loaders( - extra_dims=[()], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), dtypes=[torch.uint8] - ): - yield ArgsKwargs(image_loader) - - -KERNEL_INFOS.append( - KernelInfo( - F.equalize_image_tensor, - kernel_name="equalize_image_tensor", - sample_inputs_fn=sample_inputs_equalize_image_tensor, - reference_fn=pil_reference_wrapper(F.equalize_image_pil), - reference_inputs_fn=reference_inputs_equalize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - - -def sample_inputs_invert_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB) - ): - yield ArgsKwargs(image_loader) - - -def reference_inputs_invert_image_tensor(): - for image_loader in make_image_loaders( - color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()] - ): - yield ArgsKwargs(image_loader) - - -KERNEL_INFOS.append( - KernelInfo( - F.invert_image_tensor, - kernel_name="invert_image_tensor", - sample_inputs_fn=sample_inputs_invert_image_tensor, - reference_fn=pil_reference_wrapper(F.invert_image_pil), - reference_inputs_fn=reference_inputs_invert_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - - -_POSTERIZE_BITS = [1, 4, 8] - - -def sample_inputs_posterize_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), dtypes=[torch.uint8] - ): - yield ArgsKwargs(image_loader, bits=_POSTERIZE_BITS[0]) - - -def reference_inputs_posterize_image_tensor(): - for image_loader, bits in itertools.product( - make_image_loaders( - color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()], dtypes=[torch.uint8] - ), - _POSTERIZE_BITS, - ): - yield ArgsKwargs(image_loader, bits=bits) - - -KERNEL_INFOS.append( - KernelInfo( - F.posterize_image_tensor, - kernel_name="posterize_image_tensor", - sample_inputs_fn=sample_inputs_posterize_image_tensor, - reference_fn=pil_reference_wrapper(F.posterize_image_pil), - reference_inputs_fn=reference_inputs_posterize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - - -def _get_solarize_thresholds(dtype): - for factor in [0.1, 0.5]: - max_value = get_max_value(dtype) - yield (float if dtype.is_floating_point else int)(max_value * factor) - - -def sample_inputs_solarize_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB) - ): - yield ArgsKwargs(image_loader, threshold=next(_get_solarize_thresholds(image_loader.dtype))) - - -def reference_inputs_solarize_image_tensor(): - for image_loader in make_image_loaders( - color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()] - ): - for threshold in _get_solarize_thresholds(image_loader.dtype): - yield ArgsKwargs(image_loader, threshold=threshold) - - -KERNEL_INFOS.append( - KernelInfo( - F.solarize_image_tensor, - kernel_name="solarize_image_tensor", - 
sample_inputs_fn=sample_inputs_solarize_image_tensor, - reference_fn=pil_reference_wrapper(F.solarize_image_pil), - reference_inputs_fn=reference_inputs_solarize_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - - -def sample_inputs_autocontrast_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB) - ): - yield ArgsKwargs(image_loader) - - -def reference_inputs_autocontrast_image_tensor(): - for image_loader in make_image_loaders( - color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()] - ): - yield ArgsKwargs(image_loader) - - -KERNEL_INFOS.append( - KernelInfo( - F.autocontrast_image_tensor, - kernel_name="autocontrast_image_tensor", - sample_inputs_fn=sample_inputs_autocontrast_image_tensor, - reference_fn=pil_reference_wrapper(F.autocontrast_image_pil), - reference_inputs_fn=reference_inputs_autocontrast_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - -_ADJUST_SHARPNESS_FACTORS = [0.1, 0.5] - - -def sample_inputs_adjust_sharpness_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random", (2, 2)], - color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), - # FIXME: kernel should support arbitrary batch sizes - extra_dims=[(), (4,)], - ): - yield ArgsKwargs(image_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) - - -def reference_inputs_adjust_sharpness_image_tensor(): - for image_loader, sharpness_factor in itertools.product( - make_image_loaders(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()]), - _ADJUST_SHARPNESS_FACTORS, - ): - yield ArgsKwargs(image_loader, sharpness_factor=sharpness_factor) - - -KERNEL_INFOS.append( - KernelInfo( - F.adjust_sharpness_image_tensor, - kernel_name="adjust_sharpness_image_tensor", - sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_sharpness_image_pil), - reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - - -def sample_inputs_erase_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"]): - # FIXME: make the parameters more diverse - h, w = 6, 7 - v = torch.rand(image_loader.num_channels, h, w) - yield ArgsKwargs(image_loader, i=1, j=2, h=h, w=w, v=v) - - -KERNEL_INFOS.append( - KernelInfo( - F.erase_image_tensor, - kernel_name="erase_image_tensor", - sample_inputs_fn=sample_inputs_erase_image_tensor, - ) -) - -_ADJUST_BRIGHTNESS_FACTORS = [0.1, 0.5] - - -def sample_inputs_adjust_brightness_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB) - ): - yield ArgsKwargs(image_loader, brightness_factor=_ADJUST_BRIGHTNESS_FACTORS[0]) - - -def reference_inputs_adjust_brightness_image_tensor(): - for image_loader, brightness_factor in itertools.product( - make_image_loaders(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()]), - _ADJUST_BRIGHTNESS_FACTORS, - ): - yield ArgsKwargs(image_loader, brightness_factor=brightness_factor) - - -KERNEL_INFOS.append( - KernelInfo( - F.adjust_brightness_image_tensor, - kernel_name="adjust_brightness_image_tensor", - sample_inputs_fn=sample_inputs_adjust_brightness_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_brightness_image_pil), - reference_inputs_fn=reference_inputs_adjust_brightness_image_tensor, - 
closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - - -_ADJUST_CONTRAST_FACTORS = [0.1, 0.5] - - -def sample_inputs_adjust_contrast_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB) - ): - yield ArgsKwargs(image_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) - - -def reference_inputs_adjust_contrast_image_tensor(): - for image_loader, contrast_factor in itertools.product( - make_image_loaders(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()]), - _ADJUST_CONTRAST_FACTORS, - ): - yield ArgsKwargs(image_loader, contrast_factor=contrast_factor) - - -KERNEL_INFOS.append( - KernelInfo( - F.adjust_contrast_image_tensor, - kernel_name="adjust_contrast_image_tensor", - sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_contrast_image_pil), - reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - -_ADJUST_GAMMA_GAMMAS_GAINS = [ - (0.5, 2.0), - (0.0, 1.0), -] - - -def sample_inputs_adjust_gamma_image_tensor(): - gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB) - ): - yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) - - -def reference_inputs_adjust_gamma_image_tensor(): - for image_loader, (gamma, gain) in itertools.product( - make_image_loaders(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()]), - _ADJUST_GAMMA_GAMMAS_GAINS, - ): - yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) - - -KERNEL_INFOS.append( - KernelInfo( - F.adjust_gamma_image_tensor, - kernel_name="adjust_gamma_image_tensor", - sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_gamma_image_pil), - reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - - -_ADJUST_HUE_FACTORS = [-0.1, 0.5] - - -def sample_inputs_adjust_hue_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB) - ): - yield ArgsKwargs(image_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) - - -def reference_inputs_adjust_hue_image_tensor(): - for image_loader, hue_factor in itertools.product( - make_image_loaders(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), extra_dims=[()]), - _ADJUST_HUE_FACTORS, - ): - yield ArgsKwargs(image_loader, hue_factor=hue_factor) - - -KERNEL_INFOS.append( - KernelInfo( - F.adjust_hue_image_tensor, - kernel_name="adjust_hue_image_tensor", - sample_inputs_fn=sample_inputs_adjust_hue_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_hue_image_pil), - reference_inputs_fn=reference_inputs_adjust_hue_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - -_ADJUST_SATURATION_FACTORS = [0.1, 0.5] - - -def sample_inputs_adjust_saturation_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB) - ): - yield ArgsKwargs(image_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) - - -def reference_inputs_adjust_saturation_image_tensor(): - for image_loader, saturation_factor in itertools.product( - make_image_loaders(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB), 
extra_dims=[()]), - _ADJUST_SATURATION_FACTORS, - ): - yield ArgsKwargs(image_loader, saturation_factor=saturation_factor) - - -KERNEL_INFOS.append( - KernelInfo( - F.adjust_saturation_image_tensor, - kernel_name="adjust_saturation_image_tensor", - sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor, - reference_fn=pil_reference_wrapper(F.adjust_saturation_image_pil), - reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor, - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ) -) - - -def sample_inputs_clamp_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): - yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size - ) - - -KERNEL_INFOS.append( - KernelInfo( - F.clamp_bounding_box, - sample_inputs_fn=sample_inputs_clamp_bounding_box, - ) -) - -_FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] - - -def _get_five_ten_crop_image_size(size): - if isinstance(size, int): - crop_height = crop_width = size - elif len(size) == 1: - crop_height = crop_width = size[0] - else: - crop_height, crop_width = size - return 2 * crop_height, 2 * crop_width - - -def sample_inputs_five_crop_image_tensor(): - for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)]): - yield ArgsKwargs(image_loader, size=size) - - -def reference_inputs_five_crop_image_tensor(): - for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): - yield ArgsKwargs(image_loader, size=size) - - -def sample_inputs_ten_crop_image_tensor(): - for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)]): - yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) - - -def reference_inputs_ten_crop_image_tensor(): - for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): - yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.five_crop_image_tensor, - sample_inputs_fn=sample_inputs_five_crop_image_tensor, - reference_fn=pil_reference_wrapper(F.five_crop_image_pil), - reference_inputs_fn=reference_inputs_five_crop_image_tensor, - skips=[ - skip_integer_size_jit(), - Skip("test_batched_vs_single", reason="Custom batching needed for five_crop_image_tensor."), - ], - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - KernelInfo( - F.ten_crop_image_tensor, - sample_inputs_fn=sample_inputs_ten_crop_image_tensor, - reference_fn=pil_reference_wrapper(F.ten_crop_image_pil), - reference_inputs_fn=reference_inputs_ten_crop_image_tensor, - skips=[ - skip_integer_size_jit(), - Skip("test_batched_vs_single", reason="Custom batching needed for ten_crop_image_tensor."), - ], - closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS, - ), - ] -) - -_NORMALIZE_MEANS_STDS = [ - ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), - ([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]), -] - - -def sample_inputs_normalize_image_tensor(): - for image_loader, (mean, std) in itertools.product( - make_image_loaders(sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]), - _NORMALIZE_MEANS_STDS, - ): - yield ArgsKwargs(image_loader, mean=mean, std=std) - - -KERNEL_INFOS.append( - KernelInfo( - 
F.normalize_image_tensor, - kernel_name="normalize_image_tensor", - sample_inputs_fn=sample_inputs_normalize_image_tensor, - ) -) diff --git a/test/test_prototype_datasets_builtin.py b/test/test_prototype_datasets_builtin.py deleted file mode 100644 index 283a30a3d85..00000000000 --- a/test/test_prototype_datasets_builtin.py +++ /dev/null @@ -1,220 +0,0 @@ -import functools -import io -import pickle -from pathlib import Path - -import pytest -import torch -from builtin_dataset_mocks import DATASET_MOCKS, parametrize_dataset_mocks -from torch.testing._comparison import assert_equal, ObjectPair, TensorLikePair -from torch.utils.data import DataLoader -from torch.utils.data.graph import traverse_dps -from torch.utils.data.graph_settings import get_all_graph_pipes -from torchdata.datapipes.iter import ShardingFilter, Shuffler -from torchvision._utils import sequence_to_str -from torchvision.prototype import datasets, transforms -from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE -from torchvision.prototype.features import Image, Label - -assert_samples_equal = functools.partial( - assert_equal, pair_types=(TensorLikePair, ObjectPair), rtol=0, atol=0, equal_nan=True -) - - -def extract_datapipes(dp): - return get_all_graph_pipes(traverse_dps(dp)) - - -@pytest.fixture(autouse=True) -def test_home(mocker, tmp_path): - mocker.patch("torchvision.prototype.datasets._api.home", return_value=str(tmp_path)) - mocker.patch("torchvision.prototype.datasets.home", return_value=str(tmp_path)) - yield tmp_path - - -def test_coverage(): - untested_datasets = set(datasets.list_datasets()) - DATASET_MOCKS.keys() - if untested_datasets: - raise AssertionError( - f"The dataset(s) {sequence_to_str(sorted(untested_datasets), separate_last='and ')} " - f"are exposed through `torchvision.prototype.datasets.load()`, but are not tested. " - f"Please add mock data to `test/builtin_dataset_mocks.py`." 
- ) - - -@pytest.mark.filterwarnings("error") -class TestCommon: - @pytest.mark.parametrize("name", datasets.list_datasets()) - def test_info(self, name): - try: - info = datasets.info(name) - except ValueError: - raise AssertionError("No info available.") from None - - if not (isinstance(info, dict) and all(isinstance(key, str) for key in info.keys())): - raise AssertionError("Info should be a dictionary with string keys.") - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_smoke(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - if not isinstance(dataset, datasets.utils.Dataset): - raise AssertionError(f"Loading the dataset should return an Dataset, but got {type(dataset)} instead.") - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_sample(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - try: - sample = next(iter(dataset)) - except StopIteration: - raise AssertionError("Unable to draw any sample.") from None - except Exception as error: - raise AssertionError("Drawing a sample raised the error above.") from error - - if not isinstance(sample, dict): - raise AssertionError(f"Samples should be dictionaries, but got {type(sample)} instead.") - - if not sample: - raise AssertionError("Sample dictionary is empty.") - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_num_samples(self, dataset_mock, config): - dataset, mock_info = dataset_mock.load(config) - - assert len(list(dataset)) == mock_info["num_samples"] - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_no_vanilla_tensors(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor} - if vanilla_tensors: - raise AssertionError( - f"The values of key(s) " - f"{sequence_to_str(sorted(vanilla_tensors), separate_last='and ')} contained vanilla tensors." - ) - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_transformable(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - next(iter(dataset.map(transforms.Identity()))) - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_traversable(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - traverse_dps(dataset) - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_serializable(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - pickle.dumps(dataset) - - # This has to be a proper function, since lambda's or local functions - # cannot be pickled, but this is a requirement for the DataLoader with - # multiprocessing, i.e. num_workers > 0 - def _collate_fn(self, batch): - return batch - - @pytest.mark.parametrize("num_workers", [0, 1]) - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_data_loader(self, dataset_mock, config, num_workers): - dataset, _ = dataset_mock.load(config) - - dl = DataLoader( - dataset, - batch_size=2, - num_workers=num_workers, - collate_fn=self._collate_fn, - ) - - next(iter(dl)) - - # TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also - # that the Shuffler comes before the ShardingFilter. Early commits in https://github.com/pytorch/vision/pull/5680 - # contain a custom test for that, but we opted to wait for a potential solution / test from torchdata for now. 
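One possible way to enforce the ordering described in the TODO above is to walk the datapipe graph and require that every `ShardingFilter` has a `Shuffler` among its ancestors. The sketch below is purely illustrative and was never part of the removed test suite; it relies only on the graph utilities already imported in this file and assumes `traverse_dps` returns a nested `{id: (datapipe, parent_graph)}` mapping, which may differ across torch versions.

```python
from torch.utils.data.graph import traverse_dps
from torchdata.datapipes.iter import IterableWrapper, ShardingFilter, Shuffler


def subgraph_contains(graph, datapipe_type):
    # Recursively check whether a datapipe of `datapipe_type` occurs anywhere in `graph`.
    return any(
        isinstance(dp, datapipe_type) or subgraph_contains(parent_graph, datapipe_type)
        for dp, parent_graph in graph.values()
    )


def assert_shuffler_before_sharding_filter(graph):
    # Every ShardingFilter needs a Shuffler among its ancestors,
    # i.e. the shuffling has to happen before the sharding.
    for dp, parent_graph in graph.values():
        if isinstance(dp, ShardingFilter) and not subgraph_contains(parent_graph, Shuffler):
            raise AssertionError(f"{type(dp).__name__} is not preceded by a Shuffler.")
        assert_shuffler_before_sharding_filter(parent_graph)


# Usage sketch: passes for shuffle -> sharding_filter; would fail if the order were swapped.
dp = IterableWrapper(range(10)).shuffle().sharding_filter()
assert_shuffler_before_sharding_filter(traverse_dps(dp))
```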
- @parametrize_dataset_mocks(DATASET_MOCKS) - @pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter)) - def test_has_annotations(self, dataset_mock, config, annotation_dp_type): - dataset, _ = dataset_mock.load(config) - - if not any(isinstance(dp, annotation_dp_type) for dp in extract_datapipes(dataset)): - raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.") - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_save_load(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - sample = next(iter(dataset)) - - with io.BytesIO() as buffer: - torch.save(sample, buffer) - buffer.seek(0) - assert_samples_equal(torch.load(buffer), sample) - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_infinite_buffer_size(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - for dp in extract_datapipes(dataset): - if hasattr(dp, "buffer_size"): - # TODO: replace this with the proper sentinel as soon as https://github.com/pytorch/data/issues/335 is - # resolved - assert dp.buffer_size == INFINITE_BUFFER_SIZE - - @parametrize_dataset_mocks(DATASET_MOCKS) - def test_has_length(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - assert len(dataset) > 0 - - -@parametrize_dataset_mocks(DATASET_MOCKS["qmnist"]) -class TestQMNIST: - def test_extra_label(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - sample = next(iter(dataset)) - for key, type in ( - ("nist_hsf_series", int), - ("nist_writer_id", int), - ("digit_index", int), - ("nist_label", int), - ("global_digit_index", int), - ("duplicate", bool), - ("unused", bool), - ): - assert key in sample and isinstance(sample[key], type) - - -@parametrize_dataset_mocks(DATASET_MOCKS["gtsrb"]) -class TestGTSRB: - def test_label_matches_path(self, dataset_mock, config): - # We read the labels from the csv files instead. But for the trainset, the labels are also part of the path. 
- # This test makes sure that they're both the same - if config["split"] != "train": - return - - dataset, _ = dataset_mock.load(config) - - for sample in dataset: - label_from_path = int(Path(sample["path"]).parent.name) - assert sample["label"] == label_from_path - - -@parametrize_dataset_mocks(DATASET_MOCKS["usps"]) -class TestUSPS: - def test_sample_content(self, dataset_mock, config): - dataset, _ = dataset_mock.load(config) - - for sample in dataset: - assert "image" in sample - assert "label" in sample - - assert isinstance(sample["image"], Image) - assert isinstance(sample["label"], Label) - - assert sample["image"].shape == (1, 16, 16) diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py deleted file mode 100644 index 2098ac736ac..00000000000 --- a/test/test_prototype_datasets_utils.py +++ /dev/null @@ -1,302 +0,0 @@ -import gzip -import pathlib -import sys - -import numpy as np -import pytest -import torch -from datasets_utils import make_fake_flo_file, make_tar -from torchdata.datapipes.iter import FileOpener, TarArchiveLoader -from torchvision.datasets._optical_flow import _read_flo as read_flo_ref -from torchvision.datasets.utils import _decompress -from torchvision.prototype.datasets.utils import Dataset, GDriveResource, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import fromfile, read_flo - - -@pytest.mark.filterwarnings("error:The given NumPy array is not writeable:UserWarning") -@pytest.mark.parametrize( - ("np_dtype", "torch_dtype", "byte_order"), - [ - (">f4", torch.float32, "big"), - ("i8", torch.int64, "big"), - ("|u1", torch.uint8, sys.byteorder), - ], -) -@pytest.mark.parametrize("count", (-1, 2)) -@pytest.mark.parametrize("mode", ("rb", "r+b")) -def test_fromfile(tmpdir, np_dtype, torch_dtype, byte_order, count, mode): - path = tmpdir / "data.bin" - rng = np.random.RandomState(0) - rng.randn(5 if count == -1 else count + 1).astype(np_dtype).tofile(path) - - for count_ in (-1, count // 2): - expected = torch.from_numpy(np.fromfile(path, dtype=np_dtype, count=count_).astype(np_dtype[1:])) - - with open(path, mode) as file: - actual = fromfile(file, dtype=torch_dtype, byte_order=byte_order, count=count_) - - torch.testing.assert_close(actual, expected) - - -def test_read_flo(tmpdir): - path = tmpdir / "test.flo" - make_fake_flo_file(3, 4, path) - - with open(path, "rb") as file: - actual = read_flo(file) - - expected = torch.from_numpy(read_flo_ref(path).astype("f4", copy=False)) - - torch.testing.assert_close(actual, expected) - - -class TestOnlineResource: - class DummyResource(OnlineResource): - def __init__(self, download_fn=None, **kwargs): - super().__init__(**kwargs) - self._download_fn = download_fn - - def _download(self, root): - if self._download_fn is None: - raise pytest.UsageError( - "`_download()` was called, but `DummyResource(...)` was constructed without `download_fn`." 
- ) - - return self._download_fn(self, root) - - def _make_file(self, root, *, content, name="file.txt"): - file = root / name - with open(file, "w") as fh: - fh.write(content) - - return file - - def _make_folder(self, root, *, name="folder"): - folder = root / name - subfolder = folder / "subfolder" - subfolder.mkdir(parents=True) - - files = {} - for idx, root in enumerate([folder, folder, subfolder]): - content = f"sentinel{idx}" - file = self._make_file(root, name=f"file{idx}.txt", content=content) - files[str(file)] = content - - return folder, files - - def _make_tar(self, root, *, name="archive.tar", remove=True): - folder, files = self._make_folder(root, name=name.split(".")[0]) - archive = make_tar(root, name, folder, remove=remove) - files = {str(archive / pathlib.Path(file).relative_to(root)): content for file, content in files.items()} - return archive, files - - def test_load_file(self, tmp_path): - content = "sentinel" - file = self._make_file(tmp_path, content=content) - - resource = self.DummyResource(file_name=file.name) - - dp = resource.load(tmp_path) - assert isinstance(dp, FileOpener) - - data = list(dp) - assert len(data) == 1 - - path, buffer = data[0] - assert path == str(file) - assert buffer.read().decode() == content - - def test_load_folder(self, tmp_path): - folder, files = self._make_folder(tmp_path) - - resource = self.DummyResource(file_name=folder.name) - - dp = resource.load(tmp_path) - assert isinstance(dp, FileOpener) - assert {path: buffer.read().decode() for path, buffer in dp} == files - - def test_load_archive(self, tmp_path): - archive, files = self._make_tar(tmp_path) - - resource = self.DummyResource(file_name=archive.name) - - dp = resource.load(tmp_path) - assert isinstance(dp, TarArchiveLoader) - assert {path: buffer.read().decode() for path, buffer in dp} == files - - def test_priority_decompressed_gt_raw(self, tmp_path): - # We don't need to actually compress here. 
Adding the suffix is sufficient - self._make_file(tmp_path, content="raw_sentinel", name="file.txt.gz") - file = self._make_file(tmp_path, content="decompressed_sentinel", name="file.txt") - - resource = self.DummyResource(file_name=file.name) - - dp = resource.load(tmp_path) - path, buffer = next(iter(dp)) - - assert path == str(file) - assert buffer.read().decode() == "decompressed_sentinel" - - def test_priority_extracted_gt_decompressed(self, tmp_path): - archive, _ = self._make_tar(tmp_path, remove=False) - - resource = self.DummyResource(file_name=archive.name) - - dp = resource.load(tmp_path) - # If the archive had been selected, this would be a `TarArchiveReader` - assert isinstance(dp, FileOpener) - - def test_download(self, tmp_path): - download_fn_was_called = False - - def download_fn(resource, root): - nonlocal download_fn_was_called - download_fn_was_called = True - - return self._make_file(root, content="_", name=resource.file_name) - - resource = self.DummyResource( - file_name="file.txt", - download_fn=download_fn, - ) - - resource.load(tmp_path) - - assert download_fn_was_called, "`download_fn()` was never called" - - # This tests the `"decompress"` literal as well as a custom callable - @pytest.mark.parametrize( - "preprocess", - [ - "decompress", - lambda path: _decompress(str(path), remove_finished=True), - ], - ) - def test_preprocess_decompress(self, tmp_path, preprocess): - file_name = "file.txt.gz" - content = "sentinel" - - def download_fn(resource, root): - file = root / resource.file_name - with gzip.open(file, "wb") as fh: - fh.write(content.encode()) - return file - - resource = self.DummyResource(file_name=file_name, preprocess=preprocess, download_fn=download_fn) - - dp = resource.load(tmp_path) - data = list(dp) - assert len(data) == 1 - - path, buffer = data[0] - assert path == str(tmp_path / file_name).replace(".gz", "") - assert buffer.read().decode() == content - - def test_preprocess_extract(self, tmp_path): - files = None - - def download_fn(resource, root): - nonlocal files - archive, files = self._make_tar(root, name=resource.file_name) - return archive - - resource = self.DummyResource(file_name="folder.tar", preprocess="extract", download_fn=download_fn) - - dp = resource.load(tmp_path) - assert files is not None, "`download_fn()` was never called" - assert isinstance(dp, FileOpener) - - actual = {path: buffer.read().decode() for path, buffer in dp} - expected = { - path.replace(resource.file_name, resource.file_name.split(".")[0]): content - for path, content in files.items() - } - assert actual == expected - - def test_preprocess_only_after_download(self, tmp_path): - file = self._make_file(tmp_path, content="_") - - def preprocess(path): - raise AssertionError("`preprocess` was called although the file was already present.") - - resource = self.DummyResource( - file_name=file.name, - preprocess=preprocess, - ) - - resource.load(tmp_path) - - -class TestHttpResource: - def test_resolve_to_http(self, mocker): - file_name = "data.tar" - original_url = f"http://downloads.pytorch.org/{file_name}" - - redirected_url = original_url.replace("http", "https") - - sha256_sentinel = "sha256_sentinel" - - def preprocess_sentinel(path): - return path - - original_resource = HttpResource( - original_url, - sha256=sha256_sentinel, - preprocess=preprocess_sentinel, - ) - - mocker.patch("torchvision.prototype.datasets.utils._resource._get_redirect_url", return_value=redirected_url) - redirected_resource = original_resource.resolve() - - assert 
isinstance(redirected_resource, HttpResource) - assert redirected_resource.url == redirected_url - assert redirected_resource.file_name == file_name - assert redirected_resource.sha256 == sha256_sentinel - assert redirected_resource._preprocess is preprocess_sentinel - - def test_resolve_to_gdrive(self, mocker): - file_name = "data.tar" - original_url = f"http://downloads.pytorch.org/{file_name}" - - id_sentinel = "id-sentinel" - redirected_url = f"https://drive.google.com/file/d/{id_sentinel}/view" - - sha256_sentinel = "sha256_sentinel" - - def preprocess_sentinel(path): - return path - - original_resource = HttpResource( - original_url, - sha256=sha256_sentinel, - preprocess=preprocess_sentinel, - ) - - mocker.patch("torchvision.prototype.datasets.utils._resource._get_redirect_url", return_value=redirected_url) - redirected_resource = original_resource.resolve() - - assert isinstance(redirected_resource, GDriveResource) - assert redirected_resource.id == id_sentinel - assert redirected_resource.file_name == file_name - assert redirected_resource.sha256 == sha256_sentinel - assert redirected_resource._preprocess is preprocess_sentinel - - -def test_missing_dependency_error(): - class DummyDataset(Dataset): - def __init__(self): - super().__init__(root="root", dependencies=("fake_dependency",)) - - def _resources(self): - pass - - def _datapipe(self, resource_dps): - pass - - def __len__(self): - pass - - with pytest.raises(ModuleNotFoundError, match="depends on the third-party package 'fake_dependency'"): - DummyDataset() diff --git a/test/test_prototype_features.py b/test/test_prototype_features.py deleted file mode 100644 index 2701dd66be0..00000000000 --- a/test/test_prototype_features.py +++ /dev/null @@ -1,113 +0,0 @@ -import pytest -import torch -from torchvision.prototype import features - - -def test_isinstance(): - assert isinstance( - features.Label([0, 1, 0], categories=["foo", "bar"]), - torch.Tensor, - ) - - -def test_wrapping_no_copy(): - tensor = torch.tensor([0, 1, 0], dtype=torch.int64) - label = features.Label(tensor, categories=["foo", "bar"]) - - assert label.data_ptr() == tensor.data_ptr() - - -def test_to_wrapping(): - tensor = torch.tensor([0, 1, 0], dtype=torch.int64) - label = features.Label(tensor, categories=["foo", "bar"]) - - label_to = label.to(torch.int32) - - assert type(label_to) is features.Label - assert label_to.dtype is torch.int32 - assert label_to.categories is label.categories - - -def test_to_feature_reference(): - tensor = torch.tensor([0, 1, 0], dtype=torch.int64) - label = features.Label(tensor, categories=["foo", "bar"]).to(torch.int32) - - tensor_to = tensor.to(label) - - assert type(tensor_to) is torch.Tensor - assert tensor_to.dtype is torch.int32 - - -def test_clone_wrapping(): - tensor = torch.tensor([0, 1, 0], dtype=torch.int64) - label = features.Label(tensor, categories=["foo", "bar"]) - - label_clone = label.clone() - - assert type(label_clone) is features.Label - assert label_clone.data_ptr() != label.data_ptr() - assert label_clone.categories is label.categories - - -def test_requires_grad__wrapping(): - tensor = torch.tensor([0, 1, 0], dtype=torch.float32) - label = features.Label(tensor, categories=["foo", "bar"]) - - assert not label.requires_grad - - label_requires_grad = label.requires_grad_(True) - - assert type(label_requires_grad) is features.Label - assert label.requires_grad - assert label_requires_grad.requires_grad - - -def test_other_op_no_wrapping(): - tensor = torch.tensor([0, 1, 0], dtype=torch.int64) - label = 
features.Label(tensor, categories=["foo", "bar"]) - - # any operation besides .to() and .clone() will do here - output = label * 2 - - assert type(output) is torch.Tensor - - -@pytest.mark.parametrize( - "op", - [ - lambda t: t.numpy(), - lambda t: t.tolist(), - lambda t: t.max(dim=-1), - ], -) -def test_no_tensor_output_op_no_wrapping(op): - tensor = torch.tensor([0, 1, 0], dtype=torch.int64) - label = features.Label(tensor, categories=["foo", "bar"]) - - output = op(label) - - assert type(output) is not features.Label - - -def test_inplace_op_no_wrapping(): - tensor = torch.tensor([0, 1, 0], dtype=torch.int64) - label = features.Label(tensor, categories=["foo", "bar"]) - - output = label.add_(0) - - assert type(output) is torch.Tensor - assert type(label) is features.Label - - -def test_new_like(): - tensor = torch.tensor([0, 1, 0], dtype=torch.int64) - label = features.Label(tensor, categories=["foo", "bar"]) - - # any operation besides .to() and .clone() will do here - output = label * 2 - - label_new = features.Label.new_like(label, output) - - assert type(label_new) is features.Label - assert label_new.data_ptr() == output.data_ptr() - assert label_new.categories is label.categories diff --git a/test/test_prototype_models.py b/test/test_prototype_models.py deleted file mode 100644 index 6d9f22c1543..00000000000 --- a/test/test_prototype_models.py +++ /dev/null @@ -1,84 +0,0 @@ -import pytest -import test_models as TM -import torch -from common_utils import cpu_and_gpu, set_rng_seed -from torchvision.prototype import models - - -@pytest.mark.parametrize("model_fn", (models.depth.stereo.raft_stereo_base,)) -@pytest.mark.parametrize("model_mode", ("standard", "scripted")) -@pytest.mark.parametrize("dev", cpu_and_gpu()) -def test_raft_stereo(model_fn, model_mode, dev): - # A simple test to make sure the model can do forward pass and jit scriptable - set_rng_seed(0) - - # Use corr_pyramid and corr_block with smaller num_levels and radius to prevent nan output - # get the idea from test_models.test_raft - corr_pyramid = models.depth.stereo.raft_stereo.CorrPyramid1d(num_levels=2) - corr_block = models.depth.stereo.raft_stereo.CorrBlock1d(num_levels=2, radius=2) - model = model_fn(corr_pyramid=corr_pyramid, corr_block=corr_block).eval().to(dev) - - if model_mode == "scripted": - model = torch.jit.script(model) - - img1 = torch.rand(1, 3, 64, 64).to(dev) - img2 = torch.rand(1, 3, 64, 64).to(dev) - num_iters = 3 - - preds = model(img1, img2, num_iters=num_iters) - depth_pred = preds[-1] - - assert len(preds) == num_iters, "Number of predictions should be the same as model.num_iters" - - assert depth_pred.shape == torch.Size( - [1, 1, 64, 64] - ), f"The output shape of depth_pred should be [1, 1, 64, 64] but instead it is {preds[0].shape}" - - # Test against expected file output - TM._assert_expected(depth_pred, name=model_fn.__name__, atol=1e-2, rtol=1e-2) - - -@pytest.mark.parametrize("model_fn", (models.depth.stereo.crestereo_base,)) -@pytest.mark.parametrize("model_mode", ("standard", "scripted")) -@pytest.mark.parametrize("dev", cpu_and_gpu()) -def test_crestereo(model_fn, model_mode, dev): - set_rng_seed(0) - - model = model_fn().eval().to(dev) - - if model_mode == "scripted": - model = torch.jit.script(model) - - img1 = torch.rand(1, 3, 64, 64).to(dev) - img2 = torch.rand(1, 3, 64, 64).to(dev) - iterations = 3 - - preds = model(img1, img2, flow_init=None, num_iters=iterations) - disparity_pred = preds[-1] - - # all the pyramid levels except the highest res make only half the number of 
iterations - expected_iterations = (iterations // 2) * (len(model.resolutions) - 1) - expected_iterations += iterations - assert ( - len(preds) == expected_iterations - ), "Number of predictions should be the number of iterations multiplied by the number of pyramid levels" - - assert disparity_pred.shape == torch.Size( - [1, 2, 64, 64] - ), f"Predicted disparity should have the same spatial shape as the input. Inputs shape {img1.shape[2:]}, Prediction shape {disparity_pred.shape[2:]}" - - assert all( - d.shape == torch.Size([1, 2, 64, 64]) for d in preds - ), "All predicted disparities are expected to have the same shape" - - # test a backward pass with a dummy loss as well - preds = torch.stack(preds, dim=0) - targets = torch.ones_like(preds, requires_grad=False) - loss = torch.nn.functional.mse_loss(preds, targets) - - try: - loss.backward() - except Exception as e: - assert False, f"Backward pass failed with an unexpected exception: {e.__class__.__name__} {e}" - - TM._assert_expected(disparity_pred, name=model_fn.__name__, atol=1e-2, rtol=1e-2) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py deleted file mode 100644 index 9734a5dc30a..00000000000 --- a/test/test_prototype_transforms.py +++ /dev/null @@ -1,1780 +0,0 @@ -import itertools - -import numpy as np - -import PIL.Image - -import pytest -import torch -from common_utils import assert_equal, cpu_and_gpu -from prototype_common_utils import ( - make_bounding_box, - make_bounding_boxes, - make_detection_mask, - make_image, - make_images, - make_label, - make_masks, - make_one_hot_labels, - make_segmentation_mask, -) -from torchvision.ops.boxes import box_iou -from torchvision.prototype import features, transforms -from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image - - -def make_vanilla_tensor_images(*args, **kwargs): - for image in make_images(*args, **kwargs): - if image.ndim > 3: - continue - yield image.data - - -def make_pil_images(*args, **kwargs): - for image in make_vanilla_tensor_images(*args, **kwargs): - yield to_pil_image(image) - - -def make_vanilla_tensor_bounding_boxes(*args, **kwargs): - for bounding_box in make_bounding_boxes(*args, **kwargs): - yield bounding_box.data - - -def parametrize(transforms_with_inputs): - return pytest.mark.parametrize( - ("transform", "input"), - [ - pytest.param( - transform, - input, - id=f"{type(transform).__name__}-{type(input).__module__}.{type(input).__name__}-{idx}", - ) - for transform, inputs in transforms_with_inputs - for idx, input in enumerate(inputs) - ], - ) - - -def parametrize_from_transforms(*transforms): - transforms_with_inputs = [] - for transform in transforms: - for creation_fn in [ - make_images, - make_bounding_boxes, - make_one_hot_labels, - make_vanilla_tensor_images, - make_pil_images, - make_masks, - ]: - inputs = list(creation_fn()) - try: - output = transform(inputs[0]) - except Exception: - continue - else: - if output is inputs[0]: - continue - - transforms_with_inputs.append((transform, inputs)) - - return parametrize(transforms_with_inputs) - - -class TestSmoke: - @parametrize_from_transforms( - transforms.RandomErasing(p=1.0), - transforms.Resize([16, 16]), - transforms.CenterCrop([16, 16]), - transforms.ConvertImageDtype(), - transforms.RandomHorizontalFlip(), - transforms.Pad(5), - transforms.RandomZoomOut(), - transforms.RandomRotation(degrees=(-45, 45)), - transforms.RandomAffine(degrees=(-45, 45)), - transforms.RandomCrop([16, 16], padding=1, pad_if_needed=True), - # TODO: 
Something wrong with input data setup. Let's fix that - # transforms.RandomEqualize(), - # transforms.RandomInvert(), - # transforms.RandomPosterize(bits=4), - # transforms.RandomSolarize(threshold=0.5), - # transforms.RandomAdjustSharpness(sharpness_factor=0.5), - ) - def test_common(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transform, - [ - dict( - image=features.Image.new_like(image, image.unsqueeze(0), dtype=torch.float), - one_hot_label=features.OneHotLabel.new_like( - one_hot_label, one_hot_label.unsqueeze(0), dtype=torch.float - ), - ) - for image, one_hot_label in itertools.product(make_images(), make_one_hot_labels()) - ], - ) - for transform in [ - transforms.RandomMixup(alpha=1.0), - transforms.RandomCutmix(alpha=1.0), - ] - ] - ) - def test_mixup_cutmix(self, transform, input): - transform(input) - - # add other data that should bypass and wont raise any error - input_copy = dict(input) - input_copy["path"] = "/path/to/somewhere" - input_copy["num"] = 1234 - transform(input_copy) - - # Check if we raise an error if sample contains bbox or mask or label - err_msg = "does not support PIL images, bounding boxes, masks and plain labels" - input_copy = dict(input) - for unsup_data in [ - make_label(), - make_bounding_box(format="XYXY"), - make_detection_mask(), - make_segmentation_mask(), - ]: - input_copy["unsupported"] = unsup_data - with pytest.raises(TypeError, match=err_msg): - transform(input_copy) - - @parametrize( - [ - ( - transform, - itertools.chain.from_iterable( - fn( - color_spaces=[ - features.ColorSpace.GRAY, - features.ColorSpace.RGB, - ], - dtypes=[torch.uint8], - extra_dims=[(4,)], - ) - for fn in [ - make_images, - make_vanilla_tensor_images, - make_pil_images, - ] - ), - ) - for transform in ( - transforms.RandAugment(), - transforms.TrivialAugmentWide(), - transforms.AutoAugment(), - transforms.AugMix(), - ) - ] - ) - def test_auto_augment(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]), - itertools.chain.from_iterable( - fn(color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]) - for fn in [ - make_images, - make_vanilla_tensor_images, - ] - ), - ), - ] - ) - def test_normalize(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.RandomResizedCrop([16, 16]), - itertools.chain( - make_images(extra_dims=[(4,)]), - make_vanilla_tensor_images(), - make_pil_images(), - ), - ) - ] - ) - def test_random_resized_crop(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.ConvertColorSpace(color_space=new_color_space, old_color_space=old_color_space), - itertools.chain.from_iterable( - [ - fn(color_spaces=[old_color_space]) - for fn in ( - make_images, - make_vanilla_tensor_images, - make_pil_images, - ) - ] - ), - ) - for old_color_space, new_color_space in itertools.product( - [ - features.ColorSpace.GRAY, - features.ColorSpace.GRAY_ALPHA, - features.ColorSpace.RGB, - features.ColorSpace.RGB_ALPHA, - ], - repeat=2, - ) - ] - ) - def test_convert_color_space(self, transform, input): - transform(input) - - def test_convert_color_space_unsupported_types(self): - transform = transforms.ConvertColorSpace( - color_space=features.ColorSpace.RGB, old_color_space=features.ColorSpace.GRAY - ) - - for inpt in [make_bounding_box(format="XYXY"), make_masks()]: - output = transform(inpt) - assert output is inpt - - -@pytest.mark.parametrize("p", [0.0, 1.0]) -class TestRandomHorizontalFlip: - def 
input_expected_image_tensor(self, p, dtype=torch.float32): - input = torch.tensor([[[0, 1], [0, 1]], [[1, 0], [1, 0]]], dtype=dtype) - expected = torch.tensor([[[1, 0], [1, 0]], [[0, 1], [0, 1]]], dtype=dtype) - - return input, expected if p == 1 else input - - def test_simple_tensor(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(input) - - assert_equal(expected, actual) - - def test_pil_image(self, p): - input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(to_pil_image(input)) - - assert_equal(expected, pil_to_tensor(actual)) - - def test_features_image(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(features.Image(input)) - - assert_equal(features.Image(expected), actual) - - def test_features_mask(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(features.Mask(input)) - - assert_equal(features.Mask(expected), actual) - - def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(input) - - expected_image_tensor = torch.tensor([5, 0, 10, 5]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) - assert_equal(expected, actual) - assert actual.format == expected.format - assert actual.image_size == expected.image_size - - -@pytest.mark.parametrize("p", [0.0, 1.0]) -class TestRandomVerticalFlip: - def input_expected_image_tensor(self, p, dtype=torch.float32): - input = torch.tensor([[[1, 1], [0, 0]], [[1, 1], [0, 0]]], dtype=dtype) - expected = torch.tensor([[[0, 0], [1, 1]], [[0, 0], [1, 1]]], dtype=dtype) - - return input, expected if p == 1 else input - - def test_simple_tensor(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(input) - - assert_equal(expected, actual) - - def test_pil_image(self, p): - input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(to_pil_image(input)) - - assert_equal(expected, pil_to_tensor(actual)) - - def test_features_image(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(features.Image(input)) - - assert_equal(features.Image(expected), actual) - - def test_features_mask(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(features.Mask(input)) - - assert_equal(features.Mask(expected), actual) - - def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(input) - - expected_image_tensor = torch.tensor([0, 5, 5, 10]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) - assert_equal(expected, actual) - assert actual.format == expected.format - assert actual.image_size == expected.image_size - - -class TestPad: - def test_assertions(self): - 
with pytest.raises(TypeError, match="Got inappropriate padding arg"): - transforms.Pad("abc") - - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): - transforms.Pad([-0.7, 0, 0.7]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.Pad(12, fill="abc") - - with pytest.raises(ValueError, match="Padding mode should be either"): - transforms.Pad(12, padding_mode="abc") - - @pytest.mark.parametrize("padding", [1, (1, 2), [1, 2, 3, 4]]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("padding_mode", ["constant", "edge"]) - def test__transform(self, padding, fill, padding_mode, mocker): - transform = transforms.Pad(padding, fill=fill, padding_mode=padding_mode) - - fn = mocker.patch("torchvision.prototype.transforms.functional.pad") - inpt = mocker.MagicMock(spec=features.Image) - _ = transform(inpt) - - fill = transforms.functional._geometry._convert_fill_arg(fill) - if isinstance(padding, tuple): - padding = list(padding) - fn.assert_called_once_with(inpt, padding=padding, fill=fill, padding_mode=padding_mode) - - @pytest.mark.parametrize("fill", [12, {features.Image: 12, features.Mask: 34}]) - def test__transform_image_mask(self, fill, mocker): - transform = transforms.Pad(1, fill=fill, padding_mode="constant") - - fn = mocker.patch("torchvision.prototype.transforms.functional.pad") - image = features.Image(torch.rand(3, 32, 32)) - mask = features.Mask(torch.randint(0, 5, size=(32, 32))) - inpt = [image, mask] - _ = transform(inpt) - - if isinstance(fill, int): - fill = transforms.functional._geometry._convert_fill_arg(fill) - calls = [ - mocker.call(image, padding=1, fill=fill, padding_mode="constant"), - mocker.call(mask, padding=1, fill=fill, padding_mode="constant"), - ] - else: - fill_img = transforms.functional._geometry._convert_fill_arg(fill[type(image)]) - fill_mask = transforms.functional._geometry._convert_fill_arg(fill[type(mask)]) - calls = [ - mocker.call(image, padding=1, fill=fill_img, padding_mode="constant"), - mocker.call(mask, padding=1, fill=fill_mask, padding_mode="constant"), - ] - fn.assert_has_calls(calls) - - -class TestRandomZoomOut: - def test_assertions(self): - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomZoomOut(fill="abc") - - with pytest.raises(TypeError, match="should be a sequence of length"): - transforms.RandomZoomOut(0, side_range=0) - - with pytest.raises(ValueError, match="Invalid canvas side range"): - transforms.RandomZoomOut(0, side_range=[4.0, 1.0]) - - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__get_params(self, fill, side_range, mocker): - transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) - - image = mocker.MagicMock(spec=features.Image) - h, w = image.image_size = (24, 32) - - params = transform._get_params(image) - - assert len(params["padding"]) == 4 - assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w - assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h - assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w - assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h - - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__transform(self, fill, side_range, mocker): - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - transform = 
transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) - - fn = mocker.patch("torchvision.prototype.transforms.functional.pad") - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) - - fill = transforms.functional._geometry._convert_fill_arg(fill) - fn.assert_called_once_with(inpt, **params, fill=fill) - - @pytest.mark.parametrize("fill", [12, {features.Image: 12, features.Mask: 34}]) - def test__transform_image_mask(self, fill, mocker): - transform = transforms.RandomZoomOut(fill=fill, p=1.0) - - fn = mocker.patch("torchvision.prototype.transforms.functional.pad") - image = features.Image(torch.rand(3, 32, 32)) - mask = features.Mask(torch.randint(0, 5, size=(32, 32))) - inpt = [image, mask] - - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) - - if isinstance(fill, int): - fill = transforms.functional._geometry._convert_fill_arg(fill) - calls = [ - mocker.call(image, **params, fill=fill), - mocker.call(mask, **params, fill=fill), - ] - else: - fill_img = transforms.functional._geometry._convert_fill_arg(fill[type(image)]) - fill_mask = transforms.functional._geometry._convert_fill_arg(fill[type(mask)]) - calls = [ - mocker.call(image, **params, fill=fill_img), - mocker.call(mask, **params, fill=fill_mask), - ] - fn.assert_has_calls(calls) - - -class TestRandomRotation: - def test_assertions(self): - with pytest.raises(ValueError, match="is a single number, it must be positive"): - transforms.RandomRotation(-0.7) - - for d in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="degrees should be a sequence of length 2"): - transforms.RandomRotation(d) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomRotation(12, fill="abc") - - with pytest.raises(TypeError, match="center should be a sequence of length"): - transforms.RandomRotation(12, center=12) - - with pytest.raises(ValueError, match="center should be a sequence of length"): - transforms.RandomRotation(12, center=[1, 2, 3]) - - def test__get_params(self): - angle_bound = 34 - transform = transforms.RandomRotation(angle_bound) - - params = transform._get_params(None) - assert -angle_bound <= params["angle"] <= angle_bound - - angle_bounds = [12, 34] - transform = transforms.RandomRotation(angle_bounds) - - params = transform._get_params(None) - assert angle_bounds[0] <= params["angle"] <= angle_bounds[1] - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("expand", [False, True]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("center", [None, [2.0, 3.0]]) - def test__transform(self, degrees, expand, fill, center, mocker): - interpolation = InterpolationMode.BILINEAR - transform = transforms.RandomRotation( - degrees, interpolation=interpolation, expand=expand, fill=fill, center=center - ) - - if isinstance(degrees, (tuple, list)): - assert transform.degrees == [float(degrees[0]), float(degrees[1])] - else: - assert transform.degrees == [float(-degrees), float(degrees)] - - fn = mocker.patch("torchvision.prototype.transforms.functional.rotate") - inpt = mocker.MagicMock(spec=features.Image) - # vfdev-5, Feature Request: 
let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fill = transforms.functional._geometry._convert_fill_arg(fill) - fn.assert_called_once_with(inpt, **params, interpolation=interpolation, expand=expand, fill=fill, center=center) - - @pytest.mark.parametrize("angle", [34, -87]) - @pytest.mark.parametrize("expand", [False, True]) - def test_boundingbox_image_size(self, angle, expand): - # Specific test for BoundingBox.rotate - bbox = features.BoundingBox( - torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, image_size=(32, 32) - ) - img = features.Image(torch.rand(1, 3, 32, 32)) - - out_img = img.rotate(angle, expand=expand) - out_bbox = bbox.rotate(angle, expand=expand) - - assert out_img.image_size == out_bbox.image_size - - -class TestRandomAffine: - def test_assertions(self): - with pytest.raises(ValueError, match="is a single number, it must be positive"): - transforms.RandomAffine(-0.7) - - for d in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="degrees should be a sequence of length 2"): - transforms.RandomAffine(d) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomAffine(12, fill="abc") - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomAffine(12, fill="abc") - - for kwargs in [ - {"center": 12}, - {"translate": 12}, - {"scale": 12}, - ]: - with pytest.raises(TypeError, match="should be a sequence of length"): - transforms.RandomAffine(12, **kwargs) - - for kwargs in [{"center": [1, 2, 3]}, {"translate": [1, 2, 3]}, {"scale": [1, 2, 3]}]: - with pytest.raises(ValueError, match="should be a sequence of length"): - transforms.RandomAffine(12, **kwargs) - - with pytest.raises(ValueError, match="translation values should be between 0 and 1"): - transforms.RandomAffine(12, translate=[-1.0, 2.0]) - - with pytest.raises(ValueError, match="scale values should be positive"): - transforms.RandomAffine(12, scale=[-1.0, 2.0]) - - with pytest.raises(ValueError, match="is a single number, it must be positive"): - transforms.RandomAffine(12, shear=-10) - - for s in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="shear should be a sequence of length 2"): - transforms.RandomAffine(12, shear=s) - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("translate", [None, [0.1, 0.2]]) - @pytest.mark.parametrize("scale", [None, [0.7, 1.2]]) - @pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]]) - def test__get_params(self, degrees, translate, scale, shear, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size - - transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) - params = transform._get_params(image) - - if not isinstance(degrees, (list, tuple)): - assert -degrees <= params["angle"] <= degrees - else: - assert degrees[0] <= params["angle"] <= degrees[1] - - if translate is not None: - w_max = int(round(translate[0] * w)) - h_max = int(round(translate[1] * h)) - assert -w_max <= params["translate"][0] <= w_max - assert -h_max <= params["translate"][1] <= h_max - else: - assert params["translate"] == (0, 0) - - if scale is not None: - assert scale[0] <= params["scale"] <= scale[1] - else: - assert 
params["scale"] == 1.0 - - if shear is not None: - if isinstance(shear, float): - assert -shear <= params["shear"][0] <= shear - assert params["shear"][1] == 0.0 - elif len(shear) == 2: - assert shear[0] <= params["shear"][0] <= shear[1] - assert params["shear"][1] == 0.0 - else: - assert shear[0] <= params["shear"][0] <= shear[1] - assert shear[2] <= params["shear"][1] <= shear[3] - else: - assert params["shear"] == (0, 0) - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("translate", [None, [0.1, 0.2]]) - @pytest.mark.parametrize("scale", [None, [0.7, 1.2]]) - @pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("center", [None, [2.0, 3.0]]) - def test__transform(self, degrees, translate, scale, shear, fill, center, mocker): - interpolation = InterpolationMode.BILINEAR - transform = transforms.RandomAffine( - degrees, - translate=translate, - scale=scale, - shear=shear, - interpolation=interpolation, - fill=fill, - center=center, - ) - - if isinstance(degrees, (tuple, list)): - assert transform.degrees == [float(degrees[0]), float(degrees[1])] - else: - assert transform.degrees == [float(-degrees), float(degrees)] - - fn = mocker.patch("torchvision.prototype.transforms.functional.affine") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fill = transforms.functional._geometry._convert_fill_arg(fill) - fn.assert_called_once_with(inpt, **params, interpolation=interpolation, fill=fill, center=center) - - -class TestRandomCrop: - def test_assertions(self): - with pytest.raises(ValueError, match="Please provide only two dimensions"): - transforms.RandomCrop([10, 12, 14]) - - with pytest.raises(TypeError, match="Got inappropriate padding arg"): - transforms.RandomCrop([10, 12], padding="abc") - - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): - transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomCrop([10, 12], padding=1, fill="abc") - - with pytest.raises(ValueError, match="Padding mode should be either"): - transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") - - @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) - @pytest.mark.parametrize("size, pad_if_needed", [((10, 10), False), ((50, 25), True)]) - def test__get_params(self, padding, pad_if_needed, size, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size - - transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) - params = transform._get_params(image) - - if padding is not None: - if isinstance(padding, int): - pad_top = pad_bottom = pad_left = pad_right = padding - elif isinstance(padding, list) and len(padding) == 2: - pad_left = pad_right = padding[0] - pad_top = pad_bottom = padding[1] - elif isinstance(padding, list) and len(padding) == 4: - pad_left, pad_top, pad_right, pad_bottom = padding - - h += pad_top + pad_bottom - w += pad_left + pad_right - else: - pad_left = pad_right = pad_top 
= pad_bottom = 0 - - if pad_if_needed: - if w < size[1]: - diff = size[1] - w - pad_left += diff - pad_right += diff - w += 2 * diff - if h < size[0]: - diff = size[0] - h - pad_top += diff - pad_bottom += diff - h += 2 * diff - - padding = [pad_left, pad_top, pad_right, pad_bottom] - - assert 0 <= params["top"] <= h - size[0] + 1 - assert 0 <= params["left"] <= w - size[1] + 1 - assert params["height"] == size[0] - assert params["width"] == size[1] - assert params["needs_pad"] is any(padding) - assert params["padding"] == padding - - @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) - @pytest.mark.parametrize("pad_if_needed", [False, True]) - @pytest.mark.parametrize("fill", [False, True]) - @pytest.mark.parametrize("padding_mode", ["constant", "edge"]) - def test__transform(self, padding, pad_if_needed, fill, padding_mode, mocker): - output_size = [10, 12] - transform = transforms.RandomCrop( - output_size, padding=padding, pad_if_needed=pad_if_needed, fill=fill, padding_mode=padding_mode - ) - - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (32, 32) - - expected = mocker.MagicMock(spec=features.Image) - expected.num_channels = 3 - if isinstance(padding, int): - expected.image_size = (inpt.image_size[0] + padding, inpt.image_size[1] + padding) - elif isinstance(padding, list): - expected.image_size = ( - inpt.image_size[0] + sum(padding[0::2]), - inpt.image_size[1] + sum(padding[1::2]), - ) - else: - expected.image_size = inpt.image_size - _ = mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected) - fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop") - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - if padding is None and not pad_if_needed: - fn_crop.assert_called_once_with( - inpt, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] - ) - elif not pad_if_needed: - fn_crop.assert_called_once_with( - expected, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] - ) - elif padding is None: - # vfdev-5: I do not know how to mock and test this case - pass - else: - # vfdev-5: I do not know how to mock and test this case - pass - - -class TestGaussianBlur: - def test_assertions(self): - with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): - transforms.GaussianBlur([10, 12, 14]) - - with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): - transforms.GaussianBlur(4) - - with pytest.raises( - TypeError, match="sigma should be a single int or float or a list/tuple with length 2 floats." 
- ): - transforms.GaussianBlur(3, sigma=[1, 2, 3]) - - with pytest.raises(ValueError, match="If sigma is a single number, it must be positive"): - transforms.GaussianBlur(3, sigma=-1.0) - - with pytest.raises(ValueError, match="sigma values should be positive and of the form"): - transforms.GaussianBlur(3, sigma=[2.0, 1.0]) - - @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0]]) - def test__get_params(self, sigma): - transform = transforms.GaussianBlur(3, sigma=sigma) - params = transform._get_params(None) - - if isinstance(sigma, float): - assert params["sigma"][0] == params["sigma"][1] == 10 - else: - assert sigma[0] <= params["sigma"][0] <= sigma[1] - assert sigma[0] <= params["sigma"][1] <= sigma[1] - - @pytest.mark.parametrize("kernel_size", [3, [3, 5], (5, 3)]) - @pytest.mark.parametrize("sigma", [2.0, [2.0, 3.0]]) - def test__transform(self, kernel_size, sigma, mocker): - transform = transforms.GaussianBlur(kernel_size=kernel_size, sigma=sigma) - - if isinstance(kernel_size, (tuple, list)): - assert transform.kernel_size == kernel_size - else: - kernel_size = (kernel_size, kernel_size) - assert transform.kernel_size == kernel_size - - if isinstance(sigma, (tuple, list)): - assert transform.sigma == sigma - else: - assert transform.sigma == [sigma, sigma] - - fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, kernel_size, **params) - - -class TestRandomColorOp: - @pytest.mark.parametrize("p", [0.0, 1.0]) - @pytest.mark.parametrize( - "transform_cls, func_op_name, kwargs", - [ - (transforms.RandomEqualize, "equalize", {}), - (transforms.RandomInvert, "invert", {}), - (transforms.RandomAutocontrast, "autocontrast", {}), - (transforms.RandomPosterize, "posterize", {"bits": 4}), - (transforms.RandomSolarize, "solarize", {"threshold": 0.5}), - (transforms.RandomAdjustSharpness, "adjust_sharpness", {"sharpness_factor": 0.5}), - ], - ) - def test__transform(self, p, transform_cls, func_op_name, kwargs, mocker): - transform = transform_cls(p=p, **kwargs) - - fn = mocker.patch(f"torchvision.prototype.transforms.functional.{func_op_name}") - inpt = mocker.MagicMock(spec=features.Image) - _ = transform(inpt) - if p > 0.0: - fn.assert_called_once_with(inpt, **kwargs) - else: - assert fn.call_count == 0 - - -class TestRandomPerspective: - def test_assertions(self): - with pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"): - transforms.RandomPerspective(distortion_scale=-1.0) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomPerspective(0.5, fill="abc") - - def test__get_params(self, mocker): - dscale = 0.5 - transform = transforms.RandomPerspective(dscale) - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - params = transform._get_params(image) - - h, w = image.image_size - assert "perspective_coeffs" in params - assert len(params["perspective_coeffs"]) == 8 - - @pytest.mark.parametrize("distortion_scale", [0.1, 0.7]) - def test__transform(self, distortion_scale, mocker): - interpolation = InterpolationMode.BILINEAR - fill = 12 
- transform = transforms.RandomPerspective(distortion_scale, fill=fill, interpolation=interpolation) - - fn = mocker.patch("torchvision.prototype.transforms.functional.perspective") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) - - fill = transforms.functional._geometry._convert_fill_arg(fill) - fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) - - -class TestElasticTransform: - def test_assertions(self): - - with pytest.raises(TypeError, match="alpha should be float or a sequence of floats"): - transforms.ElasticTransform({}) - - with pytest.raises(ValueError, match="alpha is a sequence its length should be one of 2"): - transforms.ElasticTransform([1.0, 2.0, 3.0]) - - with pytest.raises(ValueError, match="alpha should be a sequence of floats"): - transforms.ElasticTransform([1, 2]) - - with pytest.raises(TypeError, match="sigma should be float or a sequence of floats"): - transforms.ElasticTransform(1.0, {}) - - with pytest.raises(ValueError, match="sigma is a sequence its length should be one of 2"): - transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0]) - - with pytest.raises(ValueError, match="sigma should be a sequence of floats"): - transforms.ElasticTransform(1.0, [1, 2]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.ElasticTransform(1.0, 2.0, fill="abc") - - def test__get_params(self, mocker): - alpha = 2.0 - sigma = 3.0 - transform = transforms.ElasticTransform(alpha, sigma) - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - params = transform._get_params(image) - - h, w = image.image_size - displacement = params["displacement"] - assert displacement.shape == (1, h, w, 2) - assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() - assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all() - - @pytest.mark.parametrize("alpha", [5.0, [5.0, 10.0]]) - @pytest.mark.parametrize("sigma", [2.0, [2.0, 5.0]]) - def test__transform(self, alpha, sigma, mocker): - interpolation = InterpolationMode.BILINEAR - fill = 12 - transform = transforms.ElasticTransform(alpha, sigma=sigma, fill=fill, interpolation=interpolation) - - if isinstance(alpha, float): - assert transform.alpha == [alpha, alpha] - else: - assert transform.alpha == alpha - - if isinstance(sigma, float): - assert transform.sigma == [sigma, sigma] - else: - assert transform.sigma == sigma - - fn = mocker.patch("torchvision.prototype.transforms.functional.elastic") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # Let's mock transform._get_params to control the output: - transform._get_params = mocker.MagicMock() - _ = transform(inpt) - params = transform._get_params(inpt) - fill = transforms.functional._geometry._convert_fill_arg(fill) - fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) - - -class TestRandomErasing: - def test_assertions(self, mocker): - with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): - 
transforms.RandomErasing(value={}) - - with pytest.raises(ValueError, match="If value is str, it should be 'random'"): - transforms.RandomErasing(value="abc") - - with pytest.raises(TypeError, match="Scale should be a sequence"): - transforms.RandomErasing(scale=123) - - with pytest.raises(TypeError, match="Ratio should be a sequence"): - transforms.RandomErasing(ratio=123) - - with pytest.raises(ValueError, match="Scale should be between 0 and 1"): - transforms.RandomErasing(scale=[-1, 2]) - - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - transform = transforms.RandomErasing(value=[1, 2, 3, 4]) - - with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): - transform._get_params(image) - - @pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"]) - def test__get_params(self, value, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - transform = transforms.RandomErasing(value=value) - params = transform._get_params(image) - - v = params["v"] - h, w = params["h"], params["w"] - i, j = params["i"], params["j"] - assert isinstance(v, torch.Tensor) - if value == "random": - assert v.shape == (image.num_channels, h, w) - elif isinstance(value, (int, float)): - assert v.shape == (1, 1, 1) - elif isinstance(value, (list, tuple)): - assert v.shape == (image.num_channels, 1, 1) - - assert 0 <= i <= image.image_size[0] - h - assert 0 <= j <= image.image_size[1] - w - - @pytest.mark.parametrize("p", [0, 1]) - def test__transform(self, mocker, p): - transform = transforms.RandomErasing(p=p) - transform._transformed_types = (mocker.MagicMock,) - - i_sentinel = mocker.MagicMock() - j_sentinel = mocker.MagicMock() - h_sentinel = mocker.MagicMock() - w_sentinel = mocker.MagicMock() - v_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._augment.RandomErasing._get_params", - return_value=dict(i=i_sentinel, j=j_sentinel, h=h_sentinel, w=w_sentinel, v=v_sentinel), - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._augment.F.erase") - output = transform(inpt_sentinel) - - if p: - mock.assert_called_once_with( - inpt_sentinel, - i=i_sentinel, - j=j_sentinel, - h=h_sentinel, - w=w_sentinel, - v=v_sentinel, - inplace=transform.inplace, - ) - else: - mock.assert_not_called() - assert output is inpt_sentinel - - -class TestTransform: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test_check_transformed_types(self, inpt_type, mocker): - # This test ensures that we correctly handle which types to transform and which to bypass - t = transforms.Transform() - inpt = mocker.MagicMock(spec=inpt_type) - - if inpt_type in (np.ndarray, str, int): - output = t(inpt) - assert output is inpt - else: - with pytest.raises(NotImplementedError): - t(inpt) - - -class TestToImageTensor: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch( - "torchvision.prototype.transforms.functional.to_image_tensor", - return_value=torch.rand(1, 3, 8, 8), - ) - - inpt = mocker.MagicMock(spec=inpt_type) - transform = transforms.ToImageTensor() - transform(inpt) - if inpt_type in (features.BoundingBox, features.Image, str, int): - assert 
fn.call_count == 0 - else: - fn.assert_called_once_with(inpt) - - -class TestToImagePIL: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.prototype.transforms.functional.to_image_pil") - - inpt = mocker.MagicMock(spec=inpt_type) - transform = transforms.ToImagePIL() - transform(inpt) - if inpt_type in (features.BoundingBox, PIL.Image.Image, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, mode=transform.mode) - - -class TestToPILImage: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.prototype.transforms.functional.to_image_pil") - - inpt = mocker.MagicMock(spec=inpt_type) - transform = transforms.ToPILImage() - transform(inpt) - if inpt_type in (PIL.Image.Image, features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, mode=transform.mode) - - -class TestToTensor: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.transforms.functional.to_tensor") - - inpt = mocker.MagicMock(spec=inpt_type) - with pytest.warns(UserWarning, match="deprecated and will be removed"): - transform = transforms.ToTensor() - transform(inpt) - if inpt_type in (features.Image, torch.Tensor, features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt) - - -class TestContainers: - @pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]) - def test_assertions(self, transform_cls): - with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): - transform_cls(transforms.RandomCrop(28)) - - @pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]) - @pytest.mark.parametrize( - "trfms", - [ - [transforms.Pad(2), transforms.RandomCrop(28)], - [lambda x: 2.0 * x, transforms.Pad(2), transforms.RandomCrop(28)], - ], - ) - def test_ctor(self, transform_cls, trfms): - c = transform_cls(trfms) - inpt = torch.rand(1, 3, 32, 32) - output = c(inpt) - assert isinstance(output, torch.Tensor) - assert output.ndim == 4 - - -class TestRandomChoice: - def test_assertions(self): - with pytest.warns(UserWarning, match="Argument p is deprecated and will be removed"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1, 2]) - - with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], probabilities=[1]) - - -class TestRandomIoUCrop: - @pytest.mark.parametrize("device", cpu_and_gpu()) - @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) - def test__get_params(self, device, options, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - bboxes = features.BoundingBox( - torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), - format="XYXY", - image_size=image.image_size, - device=device, - ) - sample = [image, bboxes] - - transform 
= transforms.RandomIoUCrop(sampler_options=options) - - n_samples = 5 - for _ in range(n_samples): - - params = transform._get_params(sample) - - if options == [2.0]: - assert len(params) == 0 - return - - assert len(params["is_within_crop_area"]) > 0 - assert params["is_within_crop_area"].dtype == torch.bool - - orig_h = image.image_size[0] - orig_w = image.image_size[1] - assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) - assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) - - left, top = params["left"], params["top"] - new_h, new_w = params["height"], params["width"] - ious = box_iou( - bboxes, - torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device), - ) - assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}" - - def test__transform_empty_params(self, mocker): - transform = transforms.RandomIoUCrop(sampler_options=[2.0]) - image = features.Image(torch.rand(1, 3, 4, 4)) - bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", image_size=(4, 4)) - label = features.Label(torch.tensor([1])) - sample = [image, bboxes, label] - # Let's mock transform._get_params to control the output: - transform._get_params = mocker.MagicMock(return_value={}) - output = transform(sample) - torch.testing.assert_close(output, sample) - - def test_forward_assertion(self): - transform = transforms.RandomIoUCrop() - with pytest.raises( - TypeError, - match="requires input sample to contain Images or PIL Images, BoundingBoxes and Labels or OneHotLabels", - ): - transform(torch.tensor(0)) - - def test__transform(self, mocker): - transform = transforms.RandomIoUCrop() - - image = features.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,)) - label = features.Label(torch.randint(0, 10, size=(6,))) - ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1)) - masks = make_detection_mask((32, 24), num_objects=6) - - sample = [image, bboxes, label, ohe_label, masks] - - fn = mocker.patch("torchvision.prototype.transforms.functional.crop", side_effect=lambda x, **params: x) - is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool) - - params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area) - transform._get_params = mocker.MagicMock(return_value=params) - output = transform(sample) - - assert fn.call_count == 3 - - expected_calls = [ - mocker.call(image, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - mocker.call(bboxes, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - mocker.call(masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - ] - - fn.assert_has_calls(expected_calls) - - expected_within_targets = sum(is_within_crop_area) - - # check number of bboxes vs number of labels: - output_bboxes = output[1] - assert isinstance(output_bboxes, features.BoundingBox) - assert len(output_bboxes) == expected_within_targets - - # check labels - output_label = output[2] - assert isinstance(output_label, features.Label) - assert len(output_label) == expected_within_targets - torch.testing.assert_close(output_label, label[is_within_crop_area]) - - output_ohe_label = output[3] - assert isinstance(output_ohe_label, features.OneHotLabel) - torch.testing.assert_close(output_ohe_label, 
ohe_label[is_within_crop_area]) - - output_masks = output[4] - assert isinstance(output_masks, features.Mask) - assert len(output_masks) == expected_within_targets - - -class TestScaleJitter: - def test__get_params(self, mocker): - image_size = (24, 32) - target_size = (16, 12) - scale_range = (0.5, 1.5) - - transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) - - n_samples = 5 - for _ in range(n_samples): - - params = transform._get_params(sample) - - assert "size" in params - size = params["size"] - - assert isinstance(size, tuple) and len(size) == 2 - height, width = size - - r_min = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[0] - r_max = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[1] - - assert int(image_size[0] * r_min) <= height <= int(image_size[0] * r_max) - assert int(image_size[1] * r_min) <= width <= int(image_size[1] * r_max) - - def test__transform(self, mocker): - interpolation_sentinel = mocker.MagicMock() - antialias_sentinel = mocker.MagicMock() - - transform = transforms.ScaleJitter( - target_size=(16, 12), interpolation=interpolation_sentinel, antialias=antialias_sentinel - ) - transform._transformed_types = (mocker.MagicMock,) - - size_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.ScaleJitter._get_params", return_value=dict(size=size_sentinel) - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.resize") - transform(inpt_sentinel) - - mock.assert_called_once_with( - inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel - ) - - -class TestRandomShortestSize: - def test__get_params(self, mocker): - image_size = (3, 10) - min_size = [5, 9] - max_size = 20 - - transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) - - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) - params = transform._get_params(sample) - - assert "size" in params - size = params["size"] - - assert isinstance(size, tuple) and len(size) == 2 - - longer = max(size) - assert longer <= max_size - - shorter = min(size) - if longer == max_size: - assert shorter <= max_size - else: - assert shorter in min_size - - def test__transform(self, mocker): - interpolation_sentinel = mocker.MagicMock() - antialias_sentinel = mocker.MagicMock() - - transform = transforms.RandomShortestSize( - min_size=[3, 5, 7], max_size=12, interpolation=interpolation_sentinel, antialias=antialias_sentinel - ) - transform._transformed_types = (mocker.MagicMock,) - - size_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.RandomShortestSize._get_params", - return_value=dict(size=size_sentinel), - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.resize") - transform(inpt_sentinel) - - mock.assert_called_once_with( - inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel - ) - - -class TestSimpleCopyPaste: - def create_fake_image(self, mocker, image_type): - if image_type == PIL.Image.Image: - return PIL.Image.new("RGB", (32, 32), 123) - return mocker.MagicMock(spec=image_type) - - def test__extract_image_targets_assertion(self, mocker): - transform = transforms.SimpleCopyPaste() - - 
flat_sample = [ - # images, batch size = 2 - self.create_fake_image(mocker, features.Image), - # labels, bboxes, masks - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.Mask), - # labels, bboxes, masks - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.Mask), - ] - - with pytest.raises(TypeError, match="requires input sample to contain equal sized list of Images"): - transform._extract_image_targets(flat_sample) - - @pytest.mark.parametrize("image_type", [features.Image, PIL.Image.Image, torch.Tensor]) - @pytest.mark.parametrize("label_type", [features.Label, features.OneHotLabel]) - def test__extract_image_targets(self, image_type, label_type, mocker): - transform = transforms.SimpleCopyPaste() - - flat_sample = [ - # images, batch size = 2 - self.create_fake_image(mocker, image_type), - self.create_fake_image(mocker, image_type), - # labels, bboxes, masks - mocker.MagicMock(spec=label_type), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.Mask), - # labels, bboxes, masks - mocker.MagicMock(spec=label_type), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.Mask), - ] - - images, targets = transform._extract_image_targets(flat_sample) - - assert len(images) == len(targets) == 2 - if image_type == PIL.Image.Image: - torch.testing.assert_close(images[0], pil_to_tensor(flat_sample[0])) - torch.testing.assert_close(images[1], pil_to_tensor(flat_sample[1])) - else: - assert images[0] == flat_sample[0] - assert images[1] == flat_sample[1] - - for target in targets: - for key, type_ in [ - ("boxes", features.BoundingBox), - ("masks", features.Mask), - ("labels", label_type), - ]: - assert key in target - assert isinstance(target[key], type_) - assert target[key] in flat_sample - - @pytest.mark.parametrize("label_type", [features.Label, features.OneHotLabel]) - def test__copy_paste(self, label_type): - image = 2 * torch.ones(3, 32, 32) - masks = torch.zeros(2, 32, 32) - masks[0, 3:9, 2:8] = 1 - masks[1, 20:30, 20:30] = 1 - labels = torch.tensor([1, 2]) - blending = True - resize_interpolation = InterpolationMode.BILINEAR - antialias = None - if label_type == features.OneHotLabel: - labels = torch.nn.functional.one_hot(labels, num_classes=5) - target = { - "boxes": features.BoundingBox( - torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", image_size=(32, 32) - ), - "masks": features.Mask(masks), - "labels": label_type(labels), - } - - paste_image = 10 * torch.ones(3, 32, 32) - paste_masks = torch.zeros(2, 32, 32) - paste_masks[0, 13:19, 12:18] = 1 - paste_masks[1, 15:19, 1:8] = 1 - paste_labels = torch.tensor([3, 4]) - if label_type == features.OneHotLabel: - paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) - paste_target = { - "boxes": features.BoundingBox( - torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", image_size=(32, 32) - ), - "masks": features.Mask(paste_masks), - "labels": label_type(paste_labels), - } - - transform = transforms.SimpleCopyPaste() - random_selection = torch.tensor([0, 1]) - output_image, output_target = transform._copy_paste( - image, target, paste_image, paste_target, random_selection, blending, resize_interpolation, antialias - ) - - assert output_image.unique().tolist() == [2, 10] - assert output_target["boxes"].shape == (4, 4) - torch.testing.assert_close(output_target["boxes"][:2, :], target["boxes"]) - 
torch.testing.assert_close(output_target["boxes"][2:, :], paste_target["boxes"]) - - expected_labels = torch.tensor([1, 2, 3, 4]) - if label_type == features.OneHotLabel: - expected_labels = torch.nn.functional.one_hot(expected_labels, num_classes=5) - torch.testing.assert_close(output_target["labels"], label_type(expected_labels)) - - assert output_target["masks"].shape == (4, 32, 32) - torch.testing.assert_close(output_target["masks"][:2, :], target["masks"]) - torch.testing.assert_close(output_target["masks"][2:, :], paste_target["masks"]) - - -class TestFixedSizeCrop: - def test__get_params(self, mocker): - crop_size = (7, 7) - batch_shape = (10,) - image_size = (11, 5) - - transform = transforms.FixedSizeCrop(size=crop_size) - - sample = dict( - image=make_image(size=image_size, color_space=features.ColorSpace.RGB), - bounding_boxes=make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=batch_shape - ), - ) - params = transform._get_params(sample) - - assert params["needs_crop"] - assert params["height"] <= crop_size[0] - assert params["width"] <= crop_size[1] - - assert ( - isinstance(params["is_valid"], torch.Tensor) - and params["is_valid"].dtype is torch.bool - and params["is_valid"].shape == batch_shape - ) - - assert params["needs_pad"] - assert any(pad > 0 for pad in params["padding"]) - - @pytest.mark.parametrize("needs", list(itertools.product((False, True), repeat=2))) - def test__transform(self, mocker, needs): - fill_sentinel = 12 - padding_mode_sentinel = mocker.MagicMock() - - transform = transforms.FixedSizeCrop((-1, -1), fill=fill_sentinel, padding_mode=padding_mode_sentinel) - transform._transformed_types = (mocker.MagicMock,) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) - mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True) - - needs_crop, needs_pad = needs - top_sentinel = mocker.MagicMock() - left_sentinel = mocker.MagicMock() - height_sentinel = mocker.MagicMock() - width_sentinel = mocker.MagicMock() - is_valid = mocker.MagicMock() if needs_crop else None - padding_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", - return_value=dict( - needs_crop=needs_crop, - top=top_sentinel, - left=left_sentinel, - height=height_sentinel, - width=width_sentinel, - is_valid=is_valid, - padding=padding_sentinel, - needs_pad=needs_pad, - ), - ) - - inpt_sentinel = mocker.MagicMock() - - mock_crop = mocker.patch("torchvision.prototype.transforms._geometry.F.crop") - mock_pad = mocker.patch("torchvision.prototype.transforms._geometry.F.pad") - transform(inpt_sentinel) - - if needs_crop: - mock_crop.assert_called_once_with( - inpt_sentinel, - top=top_sentinel, - left=left_sentinel, - height=height_sentinel, - width=width_sentinel, - ) - else: - mock_crop.assert_not_called() - - if needs_pad: - # If we cropped before, the input to F.pad is no longer inpt_sentinel. 
Thus, we can't use - # `MagicMock.assert_called_once_with` and have to perform the checks manually - mock_pad.assert_called_once() - args, kwargs = mock_pad.call_args - if not needs_crop: - assert args[0] is inpt_sentinel - assert args[1] is padding_sentinel - fill_sentinel = transforms.functional._geometry._convert_fill_arg(fill_sentinel) - assert kwargs == dict(fill=fill_sentinel, padding_mode=padding_mode_sentinel) - else: - mock_pad.assert_not_called() - - def test__transform_culling(self, mocker): - batch_size = 10 - image_size = (10, 10) - - is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) - mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", - return_value=dict( - needs_crop=True, - top=0, - left=0, - height=image_size[0], - width=image_size[1], - is_valid=is_valid, - needs_pad=False, - ), - ) - - bounding_boxes = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) - ) - masks = make_detection_mask(size=image_size, extra_dims=(batch_size,)) - labels = make_label(extra_dims=(batch_size,)) - - transform = transforms.FixedSizeCrop((-1, -1)) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) - mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True) - - output = transform( - dict( - bounding_boxes=bounding_boxes, - masks=masks, - labels=labels, - ) - ) - - assert_equal(output["bounding_boxes"], bounding_boxes[is_valid]) - assert_equal(output["masks"], masks[is_valid]) - assert_equal(output["labels"], labels[is_valid]) - - def test__transform_bounding_box_clamping(self, mocker): - batch_size = 3 - image_size = (10, 10) - - mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", - return_value=dict( - needs_crop=True, - top=0, - left=0, - height=image_size[0], - width=image_size[1], - is_valid=torch.full((batch_size,), fill_value=True), - needs_pad=False, - ), - ) - - bounding_box = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) - ) - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") - - transform = transforms.FixedSizeCrop((-1, -1)) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) - mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True) - - transform(bounding_box) - - mock.assert_called_once() - - -class TestLinearTransformation: - def test_assertions(self): - with pytest.raises(ValueError, match="transformation_matrix should be square"): - transforms.LinearTransformation(torch.rand(2, 3), torch.rand(5)) - - with pytest.raises(ValueError, match="mean_vector should have the same length"): - transforms.LinearTransformation(torch.rand(3, 3), torch.rand(5)) - - @pytest.mark.parametrize( - "inpt", - [ - 122 * torch.ones(1, 3, 8, 8), - 122.0 * torch.ones(1, 3, 8, 8), - features.Image(122 * torch.ones(1, 3, 8, 8)), - PIL.Image.new("RGB", (8, 8), (122, 122, 122)), - ], - ) - def test__transform(self, inpt): - - v = 121 * torch.ones(3 * 8 * 8) - m = torch.ones(3 * 8 * 8, 3 * 8 * 8) - transform = transforms.LinearTransformation(m, v) - - if isinstance(inpt, PIL.Image.Image): - with pytest.raises(TypeError, match="LinearTransformation does not work on PIL Images"): - transform(inpt) - else: - output = transform(inpt) - assert isinstance(output, torch.Tensor) - assert output.unique() == 3 * 8 * 8 - assert output.dtype == inpt.dtype 
- - -class TestLabelToOneHot: - def test__transform(self): - categories = ["apple", "pear", "pineapple"] - labels = features.Label(torch.tensor([0, 1, 2, 1]), categories=categories) - transform = transforms.LabelToOneHot() - ohe_labels = transform(labels) - assert isinstance(ohe_labels, features.OneHotLabel) - assert ohe_labels.shape == (4, 3) - assert ohe_labels.categories == labels.categories == categories - - -class TestRandomResize: - def test__get_params(self): - min_size = 3 - max_size = 6 - - transform = transforms.RandomResize(min_size=min_size, max_size=max_size) - - for _ in range(10): - params = transform._get_params(None) - - assert isinstance(params["size"], list) and len(params["size"]) == 1 - size = params["size"][0] - - assert min_size <= size < max_size - - def test__transform(self, mocker): - interpolation_sentinel = mocker.MagicMock() - antialias_sentinel = mocker.MagicMock() - - transform = transforms.RandomResize( - min_size=-1, max_size=-1, interpolation=interpolation_sentinel, antialias=antialias_sentinel - ) - transform._transformed_types = (mocker.MagicMock,) - - size_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.RandomResize._get_params", - return_value=dict(size=size_sentinel), - ) - - inpt_sentinel = mocker.MagicMock() - - mock_resize = mocker.patch("torchvision.prototype.transforms._geometry.F.resize") - transform(inpt_sentinel) - - mock_resize.assert_called_with( - inpt_sentinel, size_sentinel, interpolation=interpolation_sentinel, antialias=antialias_sentinel - ) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py deleted file mode 100644 index c8debe1e293..00000000000 --- a/test/test_prototype_transforms_consistency.py +++ /dev/null @@ -1,1097 +0,0 @@ -import enum -import inspect -import random -from collections import defaultdict -from importlib.machinery import SourceFileLoader -from pathlib import Path - -import numpy as np -import PIL.Image -import pytest - -import torch -from prototype_common_utils import ( - ArgsKwargs, - assert_equal, - make_bounding_box, - make_detection_mask, - make_image, - make_images, - make_label, - make_segmentation_mask, -) -from torchvision import transforms as legacy_transforms -from torchvision._utils import sequence_to_str -from torchvision.prototype import features, transforms as prototype_transforms -from torchvision.prototype.transforms import functional as F -from torchvision.prototype.transforms._utils import query_chw -from torchvision.prototype.transforms.functional import to_image_pil - -DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=[features.ColorSpace.RGB], extra_dims=[(4,)]) - - -class ConsistencyConfig: - def __init__( - self, - prototype_cls, - legacy_cls, - # If no args_kwargs is passed, only the signature will be checked - args_kwargs=(), - make_images_kwargs=None, - supports_pil=True, - removed_params=(), - ): - self.prototype_cls = prototype_cls - self.legacy_cls = legacy_cls - self.args_kwargs = args_kwargs - self.make_images_kwargs = make_images_kwargs or DEFAULT_MAKE_IMAGES_KWARGS - self.supports_pil = supports_pil - self.removed_params = removed_params - - -# These are here since both the prototype and legacy transform need to be constructed with the same random parameters -LINEAR_TRANSFORMATION_MEAN = torch.rand(36) -LINEAR_TRANSFORMATION_MATRIX = torch.rand([LINEAR_TRANSFORMATION_MEAN.numel()] * 2) - -CONSISTENCY_CONFIGS = [ - ConsistencyConfig( - prototype_transforms.Normalize, - 
legacy_transforms.Normalize, - [ - ArgsKwargs(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - ], - supports_pil=False, - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.float]), - ), - ConsistencyConfig( - prototype_transforms.Resize, - legacy_transforms.Resize, - [ - ArgsKwargs(32), - ArgsKwargs((32, 29)), - ArgsKwargs((31, 28), interpolation=prototype_transforms.InterpolationMode.NEAREST), - ArgsKwargs((33, 26), interpolation=prototype_transforms.InterpolationMode.BICUBIC), - # FIXME: these are currently failing, since the new transform only supports the enum. The int input is - # already deprecated and scheduled to be removed in 0.15. Should we support ints on the prototype - # transform? I guess it depends if we roll out before 0.15 or not. - # ArgsKwargs((30, 27), interpolation=0), - # ArgsKwargs((35, 29), interpolation=2), - # ArgsKwargs((34, 25), interpolation=3), - ArgsKwargs(31, max_size=32), - ArgsKwargs(30, max_size=100), - ArgsKwargs((29, 32), antialias=False), - ArgsKwargs((28, 31), antialias=True), - ], - ), - ConsistencyConfig( - prototype_transforms.CenterCrop, - legacy_transforms.CenterCrop, - [ - ArgsKwargs(18), - ArgsKwargs((18, 13)), - ], - ), - ConsistencyConfig( - prototype_transforms.FiveCrop, - legacy_transforms.FiveCrop, - [ - ArgsKwargs(18), - ArgsKwargs((18, 13)), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]), - ), - ConsistencyConfig( - prototype_transforms.TenCrop, - legacy_transforms.TenCrop, - [ - ArgsKwargs(18), - ArgsKwargs((18, 13)), - ArgsKwargs(18, vertical_flip=True), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]), - ), - ConsistencyConfig( - prototype_transforms.Pad, - legacy_transforms.Pad, - [ - ArgsKwargs(3), - ArgsKwargs([3]), - ArgsKwargs([2, 3]), - ArgsKwargs([3, 2, 1, 4]), - ArgsKwargs(5, fill=1, padding_mode="constant"), - ArgsKwargs(5, padding_mode="edge"), - ArgsKwargs(5, padding_mode="reflect"), - ArgsKwargs(5, padding_mode="symmetric"), - ], - ), - ConsistencyConfig( - prototype_transforms.LinearTransformation, - legacy_transforms.LinearTransformation, - [ - ArgsKwargs(LINEAR_TRANSFORMATION_MATRIX, LINEAR_TRANSFORMATION_MEAN), - ], - # Make sure that the product of the height, width and number of channels matches the number of elements in - # `LINEAR_TRANSFORMATION_MEAN`. For example 2 * 6 * 3 == 4 * 3 * 3 == 36. 
- make_images_kwargs=dict( - DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(2, 6), (4, 3)], color_spaces=[features.ColorSpace.RGB] - ), - supports_pil=False, - ), - ConsistencyConfig( - prototype_transforms.Grayscale, - legacy_transforms.Grayscale, - [ - ArgsKwargs(num_output_channels=1), - ArgsKwargs(num_output_channels=3), - ], - make_images_kwargs=dict( - DEFAULT_MAKE_IMAGES_KWARGS, color_spaces=[features.ColorSpace.RGB, features.ColorSpace.GRAY] - ), - ), - ConsistencyConfig( - prototype_transforms.ConvertImageDtype, - legacy_transforms.ConvertImageDtype, - [ - ArgsKwargs(torch.float16), - ArgsKwargs(torch.bfloat16), - ArgsKwargs(torch.float32), - ArgsKwargs(torch.float64), - ArgsKwargs(torch.uint8), - ], - supports_pil=False, - ), - ConsistencyConfig( - prototype_transforms.ToPILImage, - legacy_transforms.ToPILImage, - [ArgsKwargs()], - make_images_kwargs=dict( - color_spaces=[ - features.ColorSpace.GRAY, - features.ColorSpace.GRAY_ALPHA, - features.ColorSpace.RGB, - features.ColorSpace.RGB_ALPHA, - ], - extra_dims=[()], - ), - supports_pil=False, - ), - ConsistencyConfig( - prototype_transforms.Lambda, - legacy_transforms.Lambda, - [ - ArgsKwargs(lambda image: image / 2), - ], - # Technically, this also supports PIL, but it is overkill to write a function here that supports tensor and PIL - # images given that the transform does nothing but call it anyway. - supports_pil=False, - ), - ConsistencyConfig( - prototype_transforms.RandomHorizontalFlip, - legacy_transforms.RandomHorizontalFlip, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomVerticalFlip, - legacy_transforms.RandomVerticalFlip, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomEqualize, - legacy_transforms.RandomEqualize, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.uint8]), - ), - ConsistencyConfig( - prototype_transforms.RandomInvert, - legacy_transforms.RandomInvert, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomPosterize, - legacy_transforms.RandomPosterize, - [ - ArgsKwargs(p=0, bits=5), - ArgsKwargs(p=1, bits=1), - ArgsKwargs(p=1, bits=3), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.uint8]), - ), - ConsistencyConfig( - prototype_transforms.RandomSolarize, - legacy_transforms.RandomSolarize, - [ - ArgsKwargs(p=0, threshold=0.5), - ArgsKwargs(p=1, threshold=0.3), - ArgsKwargs(p=1, threshold=0.99), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomAutocontrast, - legacy_transforms.RandomAutocontrast, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomAdjustSharpness, - legacy_transforms.RandomAdjustSharpness, - [ - ArgsKwargs(p=0, sharpness_factor=0.5), - ArgsKwargs(p=1, sharpness_factor=0.3), - ArgsKwargs(p=1, sharpness_factor=0.99), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomGrayscale, - legacy_transforms.RandomGrayscale, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomResizedCrop, - legacy_transforms.RandomResizedCrop, - [ - ArgsKwargs(16), - ArgsKwargs(17, scale=(0.3, 0.7)), - ArgsKwargs(25, ratio=(0.5, 1.5)), - ArgsKwargs((31, 28), interpolation=prototype_transforms.InterpolationMode.NEAREST), - ArgsKwargs((33, 26), interpolation=prototype_transforms.InterpolationMode.BICUBIC), - ArgsKwargs((29, 32), antialias=False), - ArgsKwargs((28, 31), 
antialias=True), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomErasing, - legacy_transforms.RandomErasing, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ArgsKwargs(p=1, scale=(0.3, 0.7)), - ArgsKwargs(p=1, ratio=(0.5, 1.5)), - ArgsKwargs(p=1, value=1), - ArgsKwargs(p=1, value=(1, 2, 3)), - ArgsKwargs(p=1, value="random"), - ], - supports_pil=False, - ), - ConsistencyConfig( - prototype_transforms.ColorJitter, - legacy_transforms.ColorJitter, - [ - ArgsKwargs(), - ArgsKwargs(brightness=0.1), - ArgsKwargs(brightness=(0.2, 0.3)), - ArgsKwargs(contrast=0.4), - ArgsKwargs(contrast=(0.5, 0.6)), - ArgsKwargs(saturation=0.7), - ArgsKwargs(saturation=(0.8, 0.9)), - ArgsKwargs(hue=0.3), - ArgsKwargs(hue=(-0.1, 0.2)), - ArgsKwargs(brightness=0.1, contrast=0.4, saturation=0.7, hue=0.3), - ], - ), - ConsistencyConfig( - prototype_transforms.ElasticTransform, - legacy_transforms.ElasticTransform, - [ - ArgsKwargs(), - ArgsKwargs(alpha=20.0), - ArgsKwargs(alpha=(15.3, 27.2)), - ArgsKwargs(sigma=3.0), - ArgsKwargs(sigma=(2.5, 3.9)), - ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.NEAREST), - ArgsKwargs(interpolation=prototype_transforms.InterpolationMode.BICUBIC), - ArgsKwargs(fill=1), - ], - # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(163, 163), (72, 333), (313, 95)]), - ), - ConsistencyConfig( - prototype_transforms.GaussianBlur, - legacy_transforms.GaussianBlur, - [ - ArgsKwargs(kernel_size=3), - ArgsKwargs(kernel_size=(1, 5)), - ArgsKwargs(kernel_size=3, sigma=0.7), - ArgsKwargs(kernel_size=5, sigma=(0.3, 1.4)), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomAffine, - legacy_transforms.RandomAffine, - [ - ArgsKwargs(degrees=30.0), - ArgsKwargs(degrees=(-20.0, 10.0)), - ArgsKwargs(degrees=0.0, translate=(0.4, 0.6)), - ArgsKwargs(degrees=0.0, scale=(0.3, 0.8)), - ArgsKwargs(degrees=0.0, shear=13), - ArgsKwargs(degrees=0.0, shear=(8, 17)), - ArgsKwargs(degrees=0.0, shear=(4, 5, 4, 13)), - ArgsKwargs(degrees=(-20.0, 10.0), translate=(0.4, 0.6), scale=(0.3, 0.8), shear=(4, 5, 4, 13)), - ArgsKwargs(degrees=30.0, interpolation=prototype_transforms.InterpolationMode.NEAREST), - ArgsKwargs(degrees=30.0, fill=1), - ArgsKwargs(degrees=30.0, fill=(2, 3, 4)), - ArgsKwargs(degrees=30.0, center=(0, 0)), - ], - removed_params=["fillcolor", "resample"], - ), - ConsistencyConfig( - prototype_transforms.RandomCrop, - legacy_transforms.RandomCrop, - [ - ArgsKwargs(12), - ArgsKwargs((15, 17)), - ArgsKwargs(11, padding=1), - ArgsKwargs((8, 13), padding=(2, 3)), - ArgsKwargs((14, 9), padding=(0, 2, 1, 0)), - ArgsKwargs(36, pad_if_needed=True), - ArgsKwargs((7, 8), fill=1), - ArgsKwargs(5, fill=(1, 2, 3)), - ArgsKwargs(12), - ArgsKwargs(15, padding=2, padding_mode="edge"), - ArgsKwargs(17, padding=(1, 0), padding_mode="reflect"), - ArgsKwargs(8, padding=(3, 0, 0, 1), padding_mode="symmetric"), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(26, 26), (18, 33), (29, 22)]), - ), - ConsistencyConfig( - prototype_transforms.RandomPerspective, - legacy_transforms.RandomPerspective, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ArgsKwargs(p=1, distortion_scale=0.3), - ArgsKwargs(p=1, distortion_scale=0.2, interpolation=prototype_transforms.InterpolationMode.NEAREST), - ArgsKwargs(p=1, distortion_scale=0.1, fill=1), - ArgsKwargs(p=1, distortion_scale=0.4, fill=(1, 2, 3)), - ], - ), - ConsistencyConfig( - prototype_transforms.RandomRotation, - 
legacy_transforms.RandomRotation, - [ - ArgsKwargs(degrees=30.0), - ArgsKwargs(degrees=(-20.0, 10.0)), - ArgsKwargs(degrees=30.0, interpolation=prototype_transforms.InterpolationMode.BILINEAR), - ArgsKwargs(degrees=30.0, expand=True), - ArgsKwargs(degrees=30.0, center=(0, 0)), - ArgsKwargs(degrees=30.0, fill=1), - ArgsKwargs(degrees=30.0, fill=(1, 2, 3)), - ], - removed_params=["resample"], - ), - ConsistencyConfig( - prototype_transforms.PILToTensor, - legacy_transforms.PILToTensor, - ), - ConsistencyConfig( - prototype_transforms.ToTensor, - legacy_transforms.ToTensor, - ), - ConsistencyConfig( - prototype_transforms.Compose, - legacy_transforms.Compose, - ), - ConsistencyConfig( - prototype_transforms.RandomApply, - legacy_transforms.RandomApply, - ), - ConsistencyConfig( - prototype_transforms.RandomChoice, - legacy_transforms.RandomChoice, - ), - ConsistencyConfig( - prototype_transforms.RandomOrder, - legacy_transforms.RandomOrder, - ), - ConsistencyConfig( - prototype_transforms.AugMix, - legacy_transforms.AugMix, - ), - ConsistencyConfig( - prototype_transforms.AutoAugment, - legacy_transforms.AutoAugment, - ), - ConsistencyConfig( - prototype_transforms.RandAugment, - legacy_transforms.RandAugment, - ), - ConsistencyConfig( - prototype_transforms.TrivialAugmentWide, - legacy_transforms.TrivialAugmentWide, - ), -] - - -def test_automatic_coverage(): - available = { - name - for name, obj in legacy_transforms.__dict__.items() - if not name.startswith("_") and isinstance(obj, type) and not issubclass(obj, enum.Enum) - } - - checked = {config.legacy_cls.__name__ for config in CONSISTENCY_CONFIGS} - - missing = available - checked - if missing: - raise AssertionError( - f"The prototype transformations {sequence_to_str(sorted(missing), separate_last='and ')} " - f"are not checked for consistency although a legacy counterpart exists." - ) - - -@pytest.mark.parametrize("config", CONSISTENCY_CONFIGS, ids=lambda config: config.legacy_cls.__name__) -def test_signature_consistency(config): - legacy_params = dict(inspect.signature(config.legacy_cls).parameters) - prototype_params = dict(inspect.signature(config.prototype_cls).parameters) - - for param in config.removed_params: - legacy_params.pop(param, None) - - missing = legacy_params.keys() - prototype_params.keys() - if missing: - raise AssertionError( - f"The prototype transform does not support the parameters " - f"{sequence_to_str(sorted(missing), separate_last='and ')}, but the legacy transform does. " - f"If that is intentional, e.g. pending deprecation, please add the parameters to the `removed_params` on " - f"the `ConsistencyConfig`." - ) - - extra = prototype_params.keys() - legacy_params.keys() - extra_without_default = { - param - for param in extra - if prototype_params[param].default is inspect.Parameter.empty - and prototype_params[param].kind not in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD} - } - if extra_without_default: - raise AssertionError( - f"The prototype transform requires the parameters " - f"{sequence_to_str(sorted(extra_without_default), separate_last='and ')}, but the legacy transform does " - f"not. Please add a default value." 
- ) - - legacy_kinds = {name: param.kind for name, param in legacy_params.items()} - prototype_kinds = {name: prototype_params[name].kind for name in legacy_kinds.keys()} - assert prototype_kinds == legacy_kinds - - -def check_call_consistency(prototype_transform, legacy_transform, images=None, supports_pil=True): - if images is None: - images = make_images(**DEFAULT_MAKE_IMAGES_KWARGS) - - for image in images: - image_repr = f"[{tuple(image.shape)}, {str(image.dtype).rsplit('.')[-1]}]" - - image_tensor = torch.Tensor(image) - - try: - torch.manual_seed(0) - output_legacy_tensor = legacy_transform(image_tensor) - except Exception as exc: - raise pytest.UsageError( - f"Transforming a tensor image {image_repr} failed in the legacy transform with the " - f"error above. This means that you need to specify the parameters passed to `make_images` through the " - "`make_images_kwargs` of the `ConsistencyConfig`." - ) from exc - - try: - torch.manual_seed(0) - output_prototype_tensor = prototype_transform(image_tensor) - except Exception as exc: - raise AssertionError( - f"Transforming a tensor image with shape {image_repr} failed in the prototype transform with " - f"the error above. This means there is a consistency bug either in `_get_params` or in the " - f"`is_simple_tensor` path in `_transform`." - ) from exc - - assert_equal( - output_prototype_tensor, - output_legacy_tensor, - msg=lambda msg: f"Tensor image consistency check failed with: \n\n{msg}", - ) - - try: - torch.manual_seed(0) - output_prototype_image = prototype_transform(image) - except Exception as exc: - raise AssertionError( - f"Transforming a feature image with shape {image_repr} failed in the prototype transform with " - f"the error above. This means there is a consistency bug either in `_get_params` or in the " - f"`features.Image` path in `_transform`." - ) from exc - - assert_equal( - output_prototype_image, - output_prototype_tensor, - msg=lambda msg: f"Output for feature and tensor images is not equal: \n\n{msg}", - ) - - if image.ndim == 3 and supports_pil: - image_pil = to_image_pil(image) - - try: - torch.manual_seed(0) - output_legacy_pil = legacy_transform(image_pil) - except Exception as exc: - raise pytest.UsageError( - f"Transforming a PIL image with shape {image_repr} failed in the legacy transform with the " - f"error above. If this transform does not support PIL images, set `supports_pil=False` on the " - "`ConsistencyConfig`. " - ) from exc - - try: - torch.manual_seed(0) - output_prototype_pil = prototype_transform(image_pil) - except Exception as exc: - raise AssertionError( - f"Transforming a PIL image with shape {image_repr} failed in the prototype transform with " - f"the error above. This means there is a consistency bug either in `_get_params` or in the " - f"`PIL.Image.Image` path in `_transform`." - ) from exc - - assert_equal( - output_prototype_pil, - output_legacy_pil, - msg=lambda msg: f"PIL image consistency check failed with: \n\n{msg}", - ) - - -@pytest.mark.parametrize( - ("config", "args_kwargs"), - [ - pytest.param(config, args_kwargs, id=f"{config.legacy_cls.__name__}({args_kwargs})") - for config in CONSISTENCY_CONFIGS - for args_kwargs in config.args_kwargs - ], -) -def test_call_consistency(config, args_kwargs): - args, kwargs = args_kwargs - - try: - legacy_transform = config.legacy_cls(*args, **kwargs) - except Exception as exc: - raise pytest.UsageError( - f"Initializing the legacy transform failed with the error above. 
" - f"Please correct the `ArgsKwargs({args_kwargs})` in the `ConsistencyConfig`." - ) from exc - - try: - prototype_transform = config.prototype_cls(*args, **kwargs) - except Exception as exc: - raise AssertionError( - "Initializing the prototype transform failed with the error above. " - "This means there is a consistency bug in the constructor." - ) from exc - - check_call_consistency( - prototype_transform, - legacy_transform, - images=make_images(**config.make_images_kwargs), - supports_pil=config.supports_pil, - ) - - -class TestContainerTransforms: - """ - Since we are testing containers here, we also need some transforms to wrap. Thus, testing a container transform for - consistency automatically tests the wrapped transforms consistency. - - Instead of complicated mocking or creating custom transforms just for these tests, here we use deterministic ones - that were already tested for consistency above. - """ - - def test_compose(self): - prototype_transform = prototype_transforms.Compose( - [ - prototype_transforms.Resize(256), - prototype_transforms.CenterCrop(224), - ] - ) - legacy_transform = legacy_transforms.Compose( - [ - legacy_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ] - ) - - check_call_consistency(prototype_transform, legacy_transform) - - @pytest.mark.parametrize("p", [0, 0.1, 0.5, 0.9, 1]) - def test_random_apply(self, p): - prototype_transform = prototype_transforms.RandomApply( - [ - prototype_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ], - p=p, - ) - legacy_transform = legacy_transforms.RandomApply( - [ - legacy_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ], - p=p, - ) - - check_call_consistency(prototype_transform, legacy_transform) - - # We can't test other values for `p` since the random parameter generation is different - @pytest.mark.parametrize("p", [(0, 1), (1, 0)]) - def test_random_choice(self, p): - prototype_transform = prototype_transforms.RandomChoice( - [ - prototype_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ], - p=p, - ) - legacy_transform = legacy_transforms.RandomChoice( - [ - legacy_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ], - p=p, - ) - - check_call_consistency(prototype_transform, legacy_transform) - - -class TestToTensorTransforms: - def test_pil_to_tensor(self): - prototype_transform = prototype_transforms.PILToTensor() - legacy_transform = legacy_transforms.PILToTensor() - - for image in make_images(extra_dims=[()]): - image_pil = to_image_pil(image) - - assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) - - def test_to_tensor(self): - prototype_transform = prototype_transforms.ToTensor() - legacy_transform = legacy_transforms.ToTensor() - - for image in make_images(extra_dims=[()]): - image_pil = to_image_pil(image) - image_numpy = np.array(image_pil) - - assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) - assert_equal(prototype_transform(image_numpy), legacy_transform(image_numpy)) - - -class TestAATransforms: - @pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), - PIL.Image.new("RGB", (256, 256), 123), - features.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), - ], - ) - @pytest.mark.parametrize( - "interpolation", - [prototype_transforms.InterpolationMode.NEAREST, prototype_transforms.InterpolationMode.BILINEAR], - ) - def test_randaug(self, inpt, interpolation, mocker): - t_ref = 
legacy_transforms.RandAugment(interpolation=interpolation, num_ops=1) - t = prototype_transforms.RandAugment(interpolation=interpolation, num_ops=1) - - le = len(t._AUGMENTATION_SPACE) - keys = list(t._AUGMENTATION_SPACE.keys()) - randint_values = [] - for i in range(le): - # Stable API, op_index random call - randint_values.append(i) - # Stable API, if signed there is another random call - if t._AUGMENTATION_SPACE[keys[i]][1]: - randint_values.append(0) - # New API, _get_random_item - randint_values.append(i) - randint_values = iter(randint_values) - - mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) - mocker.patch("torch.rand", return_value=1.0) - - for i in range(le): - expected_output = t_ref(inpt) - output = t(inpt) - - assert_equal(expected_output, output) - - @pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), - PIL.Image.new("RGB", (256, 256), 123), - features.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), - ], - ) - @pytest.mark.parametrize( - "interpolation", - [prototype_transforms.InterpolationMode.NEAREST, prototype_transforms.InterpolationMode.BILINEAR], - ) - def test_trivial_aug(self, inpt, interpolation, mocker): - t_ref = legacy_transforms.TrivialAugmentWide(interpolation=interpolation) - t = prototype_transforms.TrivialAugmentWide(interpolation=interpolation) - - le = len(t._AUGMENTATION_SPACE) - keys = list(t._AUGMENTATION_SPACE.keys()) - randint_values = [] - for i in range(le): - # Stable API, op_index random call - randint_values.append(i) - key = keys[i] - # Stable API, random magnitude - aug_op = t._AUGMENTATION_SPACE[key] - magnitudes = aug_op[0](2, 0, 0) - if magnitudes is not None: - randint_values.append(5) - # Stable API, if signed there is another random call - if aug_op[1]: - randint_values.append(0) - # New API, _get_random_item - randint_values.append(i) - # New API, random magnitude - if magnitudes is not None: - randint_values.append(5) - - randint_values = iter(randint_values) - - mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) - mocker.patch("torch.rand", return_value=1.0) - - for _ in range(le): - expected_output = t_ref(inpt) - output = t(inpt) - - assert_equal(expected_output, output) - - @pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), - PIL.Image.new("RGB", (256, 256), 123), - features.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), - ], - ) - @pytest.mark.parametrize( - "interpolation", - [prototype_transforms.InterpolationMode.NEAREST, prototype_transforms.InterpolationMode.BILINEAR], - ) - def test_augmix(self, inpt, interpolation, mocker): - t_ref = legacy_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1) - t_ref._sample_dirichlet = lambda t: t.softmax(dim=-1) - t = prototype_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1) - t._sample_dirichlet = lambda t: t.softmax(dim=-1) - - le = len(t._AUGMENTATION_SPACE) - keys = list(t._AUGMENTATION_SPACE.keys()) - randint_values = [] - for i in range(le): - # Stable API, op_index random call - randint_values.append(i) - key = keys[i] - # Stable API, random magnitude - aug_op = t._AUGMENTATION_SPACE[key] - magnitudes = aug_op[0](2, 0, 0) - if magnitudes is not None: - randint_values.append(5) - # Stable API, if signed there is another random call - if aug_op[1]: - 
randint_values.append(0) - # New API, _get_random_item - randint_values.append(i) - # New API, random magnitude - if magnitudes is not None: - randint_values.append(5) - - randint_values = iter(randint_values) - - mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) - mocker.patch("torch.rand", return_value=1.0) - - expected_output = t_ref(inpt) - output = t(inpt) - - assert_equal(expected_output, output) - - @pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), - PIL.Image.new("RGB", (256, 256), 123), - features.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), - ], - ) - @pytest.mark.parametrize( - "interpolation", - [prototype_transforms.InterpolationMode.NEAREST, prototype_transforms.InterpolationMode.BILINEAR], - ) - def test_aa(self, inpt, interpolation): - aa_policy = legacy_transforms.AutoAugmentPolicy("imagenet") - t_ref = legacy_transforms.AutoAugment(aa_policy, interpolation=interpolation) - t = prototype_transforms.AutoAugment(aa_policy, interpolation=interpolation) - - torch.manual_seed(12) - expected_output = t_ref(inpt) - - torch.manual_seed(12) - output = t(inpt) - - assert_equal(expected_output, output) - - -def import_transforms_from_references(reference): - ref_det_filepath = Path(__file__).parent.parent / "references" / reference / "transforms.py" - return SourceFileLoader(ref_det_filepath.stem, ref_det_filepath.as_posix()).load_module() - - -det_transforms = import_transforms_from_references("detection") - - -class TestRefDetTransforms: - def make_datapoints(self, with_mask=True): - size = (600, 800) - num_objects = 22 - - pil_image = to_image_pil(make_image(size=size, color_space=features.ColorSpace.RGB)) - target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), - "labels": make_label(extra_dims=(num_objects,), categories=80), - } - if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) - - yield (pil_image, target) - - tensor_image = torch.Tensor(make_image(size=size, color_space=features.ColorSpace.RGB)) - target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), - "labels": make_label(extra_dims=(num_objects,), categories=80), - } - if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) - - yield (tensor_image, target) - - feature_image = make_image(size=size, color_space=features.ColorSpace.RGB) - target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), - "labels": make_label(extra_dims=(num_objects,), categories=80), - } - if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) - - yield (feature_image, target) - - @pytest.mark.parametrize( - "t_ref, t, data_kwargs", - [ - (det_transforms.RandomHorizontalFlip(p=1.0), prototype_transforms.RandomHorizontalFlip(p=1.0), {}), - (det_transforms.RandomIoUCrop(), prototype_transforms.RandomIoUCrop(), {"with_mask": False}), - (det_transforms.RandomZoomOut(), prototype_transforms.RandomZoomOut(), {"with_mask": False}), - (det_transforms.ScaleJitter((1024, 1024)), prototype_transforms.ScaleJitter((1024, 1024)), {}), - ( - det_transforms.FixedSizeCrop((1024, 1024), fill=0), - prototype_transforms.FixedSizeCrop((1024, 1024), fill=0), - {}, - ), - ( - 
det_transforms.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ), - prototype_transforms.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ), - {}, - ), - ], - ) - def test_transform(self, t_ref, t, data_kwargs): - for dp in self.make_datapoints(**data_kwargs): - - # We should use prototype transform first as reference transform performs inplace target update - torch.manual_seed(12) - output = t(dp) - - torch.manual_seed(12) - expected_output = t_ref(*dp) - - assert_equal(expected_output, output) - - -seg_transforms = import_transforms_from_references("segmentation") - - -# We need this transform for two reasons: -# 1. transforms.RandomCrop uses a different scheme to pad images and masks of insufficient size than its name -# counterpart in the detection references. Thus, we cannot use it with `pad_if_needed=True` -# 2. transforms.Pad only supports a fixed padding, but the segmentation datasets don't have a fixed image size. -class PadIfSmaller(prototype_transforms.Transform): - def __init__(self, size, fill=0): - super().__init__() - self.size = size - self.fill = prototype_transforms._geometry._setup_fill_arg(fill) - - def _get_params(self, sample): - _, height, width = query_chw(sample) - padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] - needs_padding = any(padding) - return dict(padding=padding, needs_padding=needs_padding) - - def _transform(self, inpt, params): - if not params["needs_padding"]: - return inpt - - fill = self.fill[type(inpt)] - fill = F._geometry._convert_fill_arg(fill) - - return F.pad(inpt, padding=params["padding"], fill=fill) - - -class TestRefSegTransforms: - def make_datapoints(self, supports_pil=True, image_dtype=torch.uint8): - size = (256, 460) - num_categories = 21 - - conv_fns = [] - if supports_pil: - conv_fns.append(to_image_pil) - conv_fns.extend([torch.Tensor, lambda x: x]) - - for conv_fn in conv_fns: - feature_image = make_image(size=size, color_space=features.ColorSpace.RGB, dtype=image_dtype) - feature_mask = make_segmentation_mask(size=size, num_categories=num_categories, dtype=torch.uint8) - - dp = (conv_fn(feature_image), feature_mask) - dp_ref = ( - to_image_pil(feature_image) if supports_pil else torch.Tensor(feature_image), - to_image_pil(feature_mask), - ) - - yield dp, dp_ref - - def set_seed(self, seed=12): - torch.manual_seed(seed) - random.seed(seed) - - def check(self, t, t_ref, data_kwargs=None): - for dp, dp_ref in self.make_datapoints(**data_kwargs or dict()): - - self.set_seed() - output = t(dp) - - self.set_seed() - expected_output = t_ref(*dp_ref) - - assert_equal(output, expected_output) - - @pytest.mark.parametrize( - ("t_ref", "t", "data_kwargs"), - [ - ( - seg_transforms.RandomHorizontalFlip(flip_prob=1.0), - prototype_transforms.RandomHorizontalFlip(p=1.0), - dict(), - ), - ( - seg_transforms.RandomHorizontalFlip(flip_prob=0.0), - prototype_transforms.RandomHorizontalFlip(p=0.0), - dict(), - ), - ( - seg_transforms.RandomCrop(size=480), - prototype_transforms.Compose( - [ - PadIfSmaller(size=480, fill=defaultdict(lambda: 0, {features.Mask: 255})), - prototype_transforms.RandomCrop(size=480), - ] - ), - dict(), - ), - ( - seg_transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - prototype_transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - dict(supports_pil=False, image_dtype=torch.float), - ), - ], - ) - def test_common(self, t_ref, t, 
data_kwargs): - self.check(t, t_ref, data_kwargs) - - def check_resize(self, mocker, t_ref, t): - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.resize") - mock_ref = mocker.patch("torchvision.transforms.functional.resize") - - for dp, dp_ref in self.make_datapoints(): - mock.reset_mock() - mock_ref.reset_mock() - - self.set_seed() - t(dp) - assert mock.call_count == 2 - assert all( - actual is expected - for actual, expected in zip([call_args[0][0] for call_args in mock.call_args_list], dp) - ) - - self.set_seed() - t_ref(*dp_ref) - assert mock_ref.call_count == 2 - assert all( - actual is expected - for actual, expected in zip([call_args[0][0] for call_args in mock_ref.call_args_list], dp_ref) - ) - - for args_kwargs, args_kwargs_ref in zip(mock.call_args_list, mock_ref.call_args_list): - assert args_kwargs[0][1] == [args_kwargs_ref[0][1]] - - def test_random_resize_train(self, mocker): - base_size = 520 - min_size = base_size // 2 - max_size = base_size * 2 - - randint = torch.randint - - def patched_randint(a, b, *other_args, **kwargs): - if kwargs or len(other_args) > 1 or other_args[0] != (): - return randint(a, b, *other_args, **kwargs) - - return random.randint(a, b) - - # We are patching torch.randint -> random.randint here, because we can't patch the modules that are not imported - # normally - t = prototype_transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) - mocker.patch( - "torchvision.prototype.transforms._geometry.torch.randint", - new=patched_randint, - ) - - t_ref = seg_transforms.RandomResize(min_size=min_size, max_size=max_size) - - self.check_resize(mocker, t_ref, t) - - def test_random_resize_eval(self, mocker): - torch.manual_seed(0) - base_size = 520 - - t = prototype_transforms.Resize(size=base_size, antialias=True) - - t_ref = seg_transforms.RandomResize(min_size=base_size, max_size=base_size) - - self.check_resize(mocker, t_ref, t) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py deleted file mode 100644 index b2c830d5d5f..00000000000 --- a/test/test_prototype_transforms_functional.py +++ /dev/null @@ -1,956 +0,0 @@ -import math -import os - -import numpy as np -import PIL.Image -import pytest - -import torch -from common_utils import cache, cpu_and_gpu, needs_cuda -from prototype_common_utils import assert_close, make_bounding_boxes, make_image -from prototype_transforms_dispatcher_infos import DISPATCHER_INFOS -from prototype_transforms_kernel_infos import KERNEL_INFOS -from torch.utils._pytree import tree_map -from torchvision.prototype import features -from torchvision.prototype.transforms import functional as F -from torchvision.prototype.transforms.functional._geometry import _center_crop_compute_padding -from torchvision.prototype.transforms.functional._meta import convert_format_bounding_box -from torchvision.transforms.functional import _get_perspective_coeffs - - -@cache -def script(fn): - try: - return torch.jit.script(fn) - except Exception as error: - raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error - - -@pytest.fixture(autouse=True) -def maybe_skip(request): - # In case the test uses no parametrization or fixtures, the `callspec` attribute does not exist - try: - callspec = request.node.callspec - except AttributeError: - return - - try: - info = callspec.params["info"] - args_kwargs = callspec.params["args_kwargs"] - except KeyError: - return - - info.maybe_skip( - 
test_name=request.node.originalname, args_kwargs=args_kwargs, device=callspec.params.get("device", "cpu") - ) - - -class TestKernels: - sample_inputs = pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.kernel_name}-{idx}") - for info in KERNEL_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs_fn()) - ], - ) - - @sample_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_scripted_vs_eager(self, info, args_kwargs, device): - kernel_eager = info.kernel - kernel_scripted = script(kernel_eager) - - args, kwargs = args_kwargs.load(device) - - actual = kernel_scripted(*args, **kwargs) - expected = kernel_eager(*args, **kwargs) - - assert_close(actual, expected, **info.closeness_kwargs) - - def _unbatch(self, batch, *, data_dims): - if isinstance(batch, torch.Tensor): - batched_tensor = batch - metadata = () - else: - batched_tensor, *metadata = batch - - if batched_tensor.ndim == data_dims: - return batch - - return [ - self._unbatch(unbatched, data_dims=data_dims) - for unbatched in ( - batched_tensor.unbind(0) if not metadata else [(t, *metadata) for t in batched_tensor.unbind(0)] - ) - ] - - @sample_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_batched_vs_single(self, info, args_kwargs, device): - (batched_input, *other_args), kwargs = args_kwargs.load(device) - - feature_type = features.Image if features.is_simple_tensor(batched_input) else type(batched_input) - # This dictionary contains the number of rightmost dimensions that contain the actual data. - # Everything to the left is considered a batch dimension. - data_dims = { - features.Image: 3, - features.BoundingBox: 1, - # `Mask`'s are special in the sense that the data dimensions depend on the type of mask. For detection masks - # it is 3 `(*, N, H, W)`, but for segmentation masks it is 2 `(*, H, W)`. Since both a grouped under one - # type all kernels should also work without differentiating between the two. Thus, we go with 2 here as - # common ground. - features.Mask: 2, - }.get(feature_type) - if data_dims is None: - raise pytest.UsageError( - f"The number of data dimensions cannot be determined for input of type {feature_type.__name__}." 
- ) from None - elif batched_input.ndim <= data_dims: - pytest.skip("Input is not batched.") - elif not all(batched_input.shape[:-data_dims]): - pytest.skip("Input has a degenerate batch shape.") - - batched_output = info.kernel(batched_input, *other_args, **kwargs) - actual = self._unbatch(batched_output, data_dims=data_dims) - - single_inputs = self._unbatch(batched_input, data_dims=data_dims) - expected = tree_map(lambda single_input: info.kernel(single_input, *other_args, **kwargs), single_inputs) - - assert_close(actual, expected, **info.closeness_kwargs) - - @sample_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_no_inplace(self, info, args_kwargs, device): - (input, *other_args), kwargs = args_kwargs.load(device) - - if input.numel() == 0: - pytest.skip("The input has a degenerate shape.") - - input_version = input._version - info.kernel(input, *other_args, **kwargs) - - assert input._version == input_version - - @sample_inputs - @needs_cuda - def test_cuda_vs_cpu(self, info, args_kwargs): - (input_cpu, *other_args), kwargs = args_kwargs.load("cpu") - input_cuda = input_cpu.to("cuda") - - output_cpu = info.kernel(input_cpu, *other_args, **kwargs) - output_cuda = info.kernel(input_cuda, *other_args, **kwargs) - - assert_close(output_cuda, output_cpu, check_device=False, **info.closeness_kwargs) - - @sample_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_dtype_and_device_consistency(self, info, args_kwargs, device): - (input, *other_args), kwargs = args_kwargs.load(device) - - output = info.kernel(input, *other_args, **kwargs) - # Most kernels just return a tensor, but some also return some additional metadata - if not isinstance(output, torch.Tensor): - output, *_ = output - - assert output.dtype == input.dtype - assert output.device == input.device - - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.kernel_name}-{idx}") - for info in KERNEL_INFOS - for idx, args_kwargs in enumerate(info.reference_inputs_fn()) - if info.reference_fn is not None - ], - ) - def test_against_reference(self, info, args_kwargs): - args, kwargs = args_kwargs.load("cpu") - - actual = info.kernel(*args, **kwargs) - expected = info.reference_fn(*args, **kwargs) - - assert_close(actual, expected, check_dtype=False, **info.closeness_kwargs) - - -class TestDispatchers: - @pytest.mark.parametrize( - ("info", "args_kwargs"), - [ - pytest.param(info, args_kwargs, id=f"{info.dispatcher.__name__}-{idx}") - for info in DISPATCHER_INFOS - for idx, args_kwargs in enumerate(info.sample_inputs(features.Image)) - if features.Image in info.kernels - ], - ) - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_scripted_smoke(self, info, args_kwargs, device): - dispatcher = script(info.dispatcher) - - (image_feature, *other_args), kwargs = args_kwargs.load(device) - image_simple_tensor = torch.Tensor(image_feature) - - dispatcher(image_simple_tensor, *other_args, **kwargs) - - # TODO: We need this until the dispatchers below also have `DispatcherInfo`'s. If they do, `test_scripted_smoke` - # replaces this test for them. 
- @pytest.mark.parametrize( - "dispatcher", - [ - F.convert_color_space, - F.convert_image_dtype, - F.get_dimensions, - F.get_image_num_channels, - F.get_image_size, - F.get_spatial_size, - F.rgb_to_grayscale, - ], - ids=lambda dispatcher: dispatcher.__name__, - ) - def test_scriptable(self, dispatcher): - script(dispatcher) - - -@pytest.mark.parametrize( - ("alias", "target"), - [ - pytest.param(alias, target, id=alias.__name__) - for alias, target in [ - (F.hflip, F.horizontal_flip), - (F.vflip, F.vertical_flip), - (F.get_image_num_channels, F.get_num_channels), - (F.to_pil_image, F.to_image_pil), - (F.elastic_transform, F.elastic), - ] - ], -) -def test_alias(alias, target): - assert alias is target - - -# TODO: All correctness checks below this line should be ported to be references on a `KernelInfo` in -# `prototype_transforms_kernel_infos.py` - - -def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): - rot = math.radians(angle_) - cx, cy = center_ - tx, ty = translate_ - sx, sy = [math.radians(sh_) for sh_ in shear_] - - c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) - t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) - c_matrix_inv = np.linalg.inv(c_matrix) - rs_matrix = np.array( - [ - [scale_ * math.cos(rot), -scale_ * math.sin(rot), 0], - [scale_ * math.sin(rot), scale_ * math.cos(rot), 0], - [0, 0, 1], - ] - ) - shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) - shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) - rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) - true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) - return true_matrix - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_affine_bounding_box_on_fixed_input(device): - # Check transformation against known expected output - image_size = (64, 64) - # xyxy format - in_boxes = [ - [20, 25, 35, 45], - [50, 5, 70, 22], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], - [1, 1, 5, 5], - ] - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device - ) - # Tested parameters - angle = 63 - scale = 0.89 - dx = 0.12 - dy = 0.23 - - # Expected bboxes computed using albumentations: - # from albumentations.augmentations.geometric.functional import bbox_shift_scale_rotate - # from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox - # expected_bboxes = [] - # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *image_size) - # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *image_size) - # out_box = denormalize_bbox(n_out_box, *image_size) - # expected_bboxes.append(out_box) - expected_bboxes = [ - (24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695), - (54.88288587110401, 50.08453280875634, 76.44484547743795, 72.81332520036864), - (27.709526487041554, 34.74952648704156, 51.650473512958435, 58.69047351295844), - (48.56528888843238, 9.611532109828834, 53.35347829361575, 14.39972151501221), - ] - - output_boxes = F.affine_bounding_box( - in_boxes, - in_boxes.format, - in_boxes.image_size, - angle, - (dx * image_size[1], dy * image_size[0]), - scale, - shear=(0, 0), - ) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def 
test_correctness_affine_segmentation_mask_on_fixed_input(device): - # Check transformation against known expected output and CPU/CUDA devices - - # Create a fixed input segmentation mask with 2 square masks - # in top-left, bottom-left corners - mask = torch.zeros(1, 32, 32, dtype=torch.long, device=device) - mask[0, 2:10, 2:10] = 1 - mask[0, 32 - 9 : 32 - 3, 3:9] = 2 - - # Rotate 90 degrees and scale - expected_mask = torch.rot90(mask, k=-1, dims=(-2, -1)) - expected_mask = torch.nn.functional.interpolate(expected_mask[None, :].float(), size=(64, 64), mode="nearest") - expected_mask = expected_mask[0, :, 16 : 64 - 16, 16 : 64 - 16].long() - - out_mask = F.affine_mask(mask, 90, [0.0, 0.0], 64.0 / 32.0, [0.0, 0.0]) - - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("angle", range(-90, 90, 56)) -@pytest.mark.parametrize("expand, center", [(True, None), (False, None), (False, (12, 14))]) -def test_correctness_rotate_bounding_box(angle, expand, center): - def _compute_expected_bbox(bbox, angle_, expand_, center_): - affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) - affine_matrix = affine_matrix[:2, :] - - height, width = bbox.image_size - bbox_xyxy = convert_format_bounding_box( - bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY - ) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - # image frame - [0.0, 0.0, 1.0], - [0.0, height, 1.0], - [width, height, 1.0], - [width, 0.0, 1.0], - ] - ) - transformed_points = np.matmul(points, affine_matrix.T) - out_bbox = [ - np.min(transformed_points[:4, 0]), - np.min(transformed_points[:4, 1]), - np.max(transformed_points[:4, 0]), - np.max(transformed_points[:4, 1]), - ] - if expand_: - tr_x = np.min(transformed_points[4:, 0]) - tr_y = np.min(transformed_points[4:, 1]) - out_bbox[0] -= tr_x - out_bbox[1] -= tr_y - out_bbox[2] -= tr_x - out_bbox[3] -= tr_y - - height = int(height - 2 * tr_y) - width = int(width - 2 * tr_x) - - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYXY, - image_size=(height, width), - dtype=bbox.dtype, - device=bbox.device, - ) - return ( - convert_format_bounding_box( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False - ), - (height, width), - ) - - image_size = (32, 38) - - for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_bboxes, output_image_size = F.rotate_bounding_box( - bboxes, - bboxes_format, - image_size=bboxes_image_size, - angle=angle, - expand=expand, - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in bboxes_image_size[::-1]] - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bbox, expected_image_size = _compute_expected_bbox(bbox, -angle, expand, center_) - expected_bboxes.append(expected_bbox) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0) - torch.testing.assert_close(output_image_size, expected_image_size, atol=1, rtol=0) - - 
-@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("expand", [False]) # expand=True does not match D2 -def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): - # Check transformation against known expected output - image_size = (64, 64) - # xyxy format - in_boxes = [ - [1, 1, 5, 5], - [1, image_size[0] - 6, 5, image_size[0] - 2], - [image_size[1] - 6, image_size[0] - 6, image_size[1] - 2, image_size[0] - 2], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], - ] - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device - ) - # Tested parameters - angle = 45 - center = None if expand else [12, 23] - - # # Expected bboxes computed using Detectron2: - # from detectron2.data.transforms import RotationTransform, AugmentationList - # from detectron2.data.transforms import AugInput - # import cv2 - # inpt = AugInput(im1, boxes=np.array(in_boxes, dtype="float32")) - # augs = AugmentationList([RotationTransform(*size, angle, expand=expand, center=center, interp=cv2.INTER_NEAREST), ]) - # out = augs(inpt) - # print(inpt.boxes) - if expand: - expected_bboxes = [ - [1.65937957, 42.67157288, 7.31623382, 48.32842712], - [41.96446609, 82.9766594, 47.62132034, 88.63351365], - [82.26955262, 42.67157288, 87.92640687, 48.32842712], - [31.35786438, 31.35786438, 59.64213562, 59.64213562], - ] - else: - expected_bboxes = [ - [-11.33452378, 12.39339828, -5.67766953, 18.05025253], - [28.97056275, 52.69848481, 34.627417, 58.35533906], - [69.27564928, 12.39339828, 74.93250353, 18.05025253], - [18.36396103, 1.07968978, 46.64823228, 29.36396103], - ] - - output_boxes, _ = F.rotate_bounding_box( - in_boxes, - in_boxes.format, - in_boxes.image_size, - angle, - expand=expand, - center=center, - ) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_rotate_segmentation_mask_on_fixed_input(device): - # Check transformation against known expected output and CPU/CUDA devices - - # Create a fixed input segmentation mask with 2 square masks - # in top-left, bottom-left corners - mask = torch.zeros(1, 32, 32, dtype=torch.long, device=device) - mask[0, 2:10, 2:10] = 1 - mask[0, 32 - 9 : 32 - 3, 3:9] = 2 - - # Rotate 90 degrees - expected_mask = torch.rot90(mask, k=1, dims=(-2, -1)) - out_mask = F.rotate_mask(mask, 90, expand=False) - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "format", - [features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH], -) -@pytest.mark.parametrize( - "top, left, height, width, expected_bboxes", - [ - [8, 12, 30, 40, [(-2.0, 7.0, 13.0, 27.0), (38.0, -3.0, 58.0, 14.0), (33.0, 38.0, 44.0, 54.0)]], - [-8, 12, 70, 40, [(-2.0, 23.0, 13.0, 43.0), (38.0, 13.0, 58.0, 30.0), (33.0, 54.0, 44.0, 70.0)]], - ], -) -def test_correctness_crop_bounding_box(device, format, top, left, height, width, expected_bboxes): - - # Expected bboxes computed using Albumentations: - # import numpy as np - # from albumentations.augmentations.crops.functional import crop_bbox_by_coords, normalize_bbox, denormalize_bbox - # expected_bboxes = [] - # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *size) - # n_out_box = crop_bbox_by_coords( - # n_in_box, (left, top, left + width, top + height), height, width, *size 
- # ) - # out_box = denormalize_bbox(n_out_box, height, width) - # expected_bboxes.append(out_box) - - size = (64, 76) - # xyxy format - in_boxes = [ - [10.0, 15.0, 25.0, 35.0], - [50.0, 5.0, 70.0, 22.0], - [45.0, 46.0, 56.0, 62.0], - ] - in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=size, device=device) - if format != features.BoundingBoxFormat.XYXY: - in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) - - output_boxes, output_image_size = F.crop_bounding_box( - in_boxes, - format, - top, - left, - size[0], - size[1], - ) - - if format != features.BoundingBoxFormat.XYXY: - output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - torch.testing.assert_close(output_image_size, size) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_horizontal_flip_segmentation_mask_on_fixed_input(device): - mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - mask[:, :, 0] = 1 - - out_mask = F.horizontal_flip_mask(mask) - - expected_mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - expected_mask[:, :, -1] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): - mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - mask[:, 0, :] = 1 - - out_mask = F.vertical_flip_mask(mask) - - expected_mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - expected_mask[:, -1, :] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "format", - [features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH], -) -@pytest.mark.parametrize( - "top, left, height, width, size", - [ - [0, 0, 30, 30, (60, 60)], - [-5, 5, 35, 45, (32, 34)], - ], -) -def test_correctness_resized_crop_bounding_box(device, format, top, left, height, width, size): - def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): - # bbox should be xyxy - bbox[0] = (bbox[0] - left_) * size_[1] / width_ - bbox[1] = (bbox[1] - top_) * size_[0] / height_ - bbox[2] = (bbox[2] - left_) * size_[1] / width_ - bbox[3] = (bbox[3] - top_) * size_[0] / height_ - return bbox - - image_size = (100, 100) - # xyxy format - in_boxes = [ - [10.0, 10.0, 20.0, 20.0], - [5.0, 10.0, 15.0, 20.0], - ] - expected_bboxes = [] - for in_box in in_boxes: - expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size)) - expected_bboxes = torch.tensor(expected_bboxes, device=device) - - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, device=device - ) - if format != features.BoundingBoxFormat.XYXY: - in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) - - output_boxes, output_image_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) - - if format != features.BoundingBoxFormat.XYXY: - output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) - - torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_image_size, size) - - -def _parse_padding(padding): - if isinstance(padding, int): - return [padding] * 4 - if 
isinstance(padding, list): - if len(padding) == 1: - return padding * 4 - if len(padding) == 2: - return padding * 2 # [left, up, right, down] - - return padding - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("padding", [[1], [1, 1], [1, 1, 2, 2]]) -def test_correctness_pad_bounding_box(device, padding): - def _compute_expected_bbox(bbox, padding_): - pad_left, pad_up, _, _ = _parse_padding(padding_) - - bbox_format = bbox.format - bbox_dtype = bbox.dtype - bbox = convert_format_bounding_box(bbox, old_format=bbox_format, new_format=features.BoundingBoxFormat.XYXY) - - bbox[0::2] += pad_left - bbox[1::2] += pad_up - - bbox = convert_format_bounding_box( - bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox_format, copy=False - ) - if bbox.dtype != bbox_dtype: - # Temporary cast to original dtype - # e.g. float32 -> int - bbox = bbox.to(bbox_dtype) - return bbox - - def _compute_expected_image_size(bbox, padding_): - pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) - height, width = bbox.image_size - return height + pad_up + pad_down, width + pad_left + pad_right - - for bboxes in make_bounding_boxes(): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_boxes, output_image_size = F.pad_bounding_box( - bboxes, format=bboxes_format, image_size=bboxes_image_size, padding=padding - ) - - torch.testing.assert_close(output_image_size, _compute_expected_image_size(bboxes, padding)) - - if bboxes.ndim < 2 or bboxes.shape[0] == 0: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, padding)) - - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_pad_segmentation_mask_on_fixed_input(device): - mask = torch.ones((1, 3, 3), dtype=torch.long, device=device) - - out_mask = F.pad_mask(mask, padding=[1, 1, 1, 1]) - - expected_mask = torch.zeros((1, 5, 5), dtype=torch.long, device=device) - expected_mask[:, 1:-1, 1:-1] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "startpoints, endpoints", - [ - [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]], - ], -) -def test_correctness_perspective_bounding_box(device, startpoints, endpoints): - def _compute_expected_bbox(bbox, pcoeffs_): - m1 = np.array( - [ - [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]], - [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]], - ] - ) - m2 = np.array( - [ - [pcoeffs_[6], pcoeffs_[7], 1.0], - [pcoeffs_[6], pcoeffs_[7], 1.0], - ] - ) - - bbox_xyxy = convert_format_bounding_box( - bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY - ) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - ] - ) - numer = np.matmul(points, m1.T) - denom = np.matmul(points, m2.T) - transformed_points = 
numer / denom - out_bbox = [ - np.min(transformed_points[:, 0]), - np.min(transformed_points[:, 1]), - np.max(transformed_points[:, 0]), - np.max(transformed_points[:, 1]), - ] - out_bbox = features.BoundingBox( - np.array(out_bbox), - format=features.BoundingBoxFormat.XYXY, - image_size=bbox.image_size, - dtype=bbox.dtype, - device=bbox.device, - ) - return convert_format_bounding_box( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False - ) - - image_size = (32, 38) - - pcoeffs = _get_perspective_coeffs(startpoints, endpoints) - inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - - for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_bboxes = F.perspective_bounding_box( - bboxes, - bboxes_format, - perspective_coeffs=pcoeffs, - ) - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=0, atol=1) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "output_size", - [(18, 18), [18, 15], (16, 19), [12], [46, 48]], -) -def test_correctness_center_crop_bounding_box(device, output_size): - def _compute_expected_bbox(bbox, output_size_): - format_ = bbox.format - image_size_ = bbox.image_size - bbox = convert_format_bounding_box(bbox, format_, features.BoundingBoxFormat.XYWH) - - if len(output_size_) == 1: - output_size_.append(output_size_[-1]) - - cy = int(round((image_size_[0] - output_size_[0]) * 0.5)) - cx = int(round((image_size_[1] - output_size_[1]) * 0.5)) - out_bbox = [ - bbox[0].item() - cx, - bbox[1].item() - cy, - bbox[2].item(), - bbox[3].item(), - ] - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYWH, - image_size=output_size_, - dtype=bbox.dtype, - device=bbox.device, - ) - return convert_format_bounding_box(out_bbox, features.BoundingBoxFormat.XYWH, format_, copy=False) - - for bboxes in make_bounding_boxes(extra_dims=((4,),)): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_boxes, output_image_size = F.center_crop_bounding_box( - bboxes, bboxes_format, bboxes_image_size, output_size - ) - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) - - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_image_size, output_size) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("output_size", [[4, 2], [4], [7, 6]]) -def test_correctness_center_crop_mask(device, output_size): - def _compute_expected_mask(mask, output_size): - crop_height, crop_width = output_size if len(output_size) > 1 else [output_size[0], output_size[0]] - - _, image_height, image_width = mask.shape - if crop_width > image_height or crop_height 
> image_width: - padding = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) - mask = F.pad_image_tensor(mask, padding, fill=0) - - left = round((image_width - crop_width) * 0.5) - top = round((image_height - crop_height) * 0.5) - - return mask[:, top : top + crop_height, left : left + crop_width] - - mask = torch.randint(0, 2, size=(1, 6, 6), dtype=torch.long, device=device) - actual = F.center_crop_mask(mask, output_size) - - expected = _compute_expected_mask(mask, output_size) - torch.testing.assert_close(expected, actual) - - -# Copied from test/test_functional_tensor.py -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("image_size", ("small", "large")) -@pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) -@pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) -def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma): - fn = F.gaussian_blur_image_tensor - - # true_cv2_results = { - # # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) - # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8) - # "3_3_0.8": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5) - # "3_3_0.5": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8) - # "3_5_0.8": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5) - # "3_5_0.5": ... - # # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) - # # cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7) - # "23_23_1.7": ... - # } - p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") - true_cv2_results = torch.load(p) - - if image_size == "small": - tensor = ( - torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) - ) - else: - tensor = torch.from_numpy(np.arange(26 * 28, dtype="uint8").reshape((1, 26, 28))).to(device) - - if dt == torch.float16 and device == "cpu": - # skip float16 on CPU case - return - - if dt is not None: - tensor = tensor.to(dtype=dt) - - _ksize = (ksize, ksize) if isinstance(ksize, int) else ksize - _sigma = sigma[0] if sigma is not None else None - shape = tensor.shape - gt_key = f"{shape[-2]}_{shape[-1]}_{shape[-3]}__{_ksize[0]}_{_ksize[1]}_{_sigma}" - if gt_key not in true_cv2_results: - return - - true_out = ( - torch.tensor(true_cv2_results[gt_key]).reshape(shape[-2], shape[-1], shape[-3]).permute(2, 0, 1).to(tensor) - ) - - image = features.Image(tensor) - - out = fn(image, kernel_size=ksize, sigma=sigma) - torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") - - -def test_normalize_output_type(): - inpt = torch.rand(1, 3, 32, 32) - output = F.normalize(inpt, mean=[0.5, 0.5, 0.5], std=[1.0, 1.0, 1.0]) - assert type(output) is torch.Tensor - torch.testing.assert_close(inpt - 0.5, output) - - inpt = make_image(color_space=features.ColorSpace.RGB) - output = F.normalize(inpt, mean=[0.5, 0.5, 0.5], std=[1.0, 1.0, 1.0]) - assert type(output) is torch.Tensor - torch.testing.assert_close(inpt - 0.5, output) - - -@pytest.mark.parametrize( - "inpt", - [ - 127 * np.ones((32, 32, 3), dtype="uint8"), - PIL.Image.new("RGB", (32, 32), 122), - ], -) -def test_to_image_tensor(inpt): - output = F.to_image_tensor(inpt) - assert isinstance(output, torch.Tensor) - - assert np.asarray(inpt).sum() == output.sum().item() - - if isinstance(inpt, PIL.Image.Image): - 
# we can't check this option - # as PIL -> numpy is always copying - return - - inpt[0, 0, 0] = 11 - assert output[0, 0, 0] == 11 - - -@pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(3, 32, 32), dtype=torch.uint8), - 127 * np.ones((32, 32, 3), dtype="uint8"), - ], -) -@pytest.mark.parametrize("mode", [None, "RGB"]) -def test_to_image_pil(inpt, mode): - output = F.to_image_pil(inpt, mode=mode) - assert isinstance(output, PIL.Image.Image) - - assert np.asarray(inpt).sum() == np.asarray(output).sum() diff --git a/test/test_prototype_transforms_utils.py b/test/test_prototype_transforms_utils.py deleted file mode 100644 index 9a8ed67dde2..00000000000 --- a/test/test_prototype_transforms_utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import PIL.Image -import pytest - -import torch - -from prototype_common_utils import make_bounding_box, make_detection_mask, make_image - -from torchvision.prototype import features -from torchvision.prototype.transforms._utils import has_all, has_any -from torchvision.prototype.transforms.functional import to_image_pil - - -IMAGE = make_image(color_space=features.ColorSpace.RGB) -BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, image_size=IMAGE.image_size) -MASK = make_detection_mask(size=IMAGE.image_size) - - -@pytest.mark.parametrize( - ("sample", "types", "expected"), - [ - ((IMAGE, BOUNDING_BOX, MASK), (features.Image,), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.BoundingBox,), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.Mask,), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.Image, features.BoundingBox), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.Image, features.Mask), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.BoundingBox, features.Mask), True), - ((MASK,), (features.Image, features.BoundingBox), False), - ((BOUNDING_BOX,), (features.Image, features.Mask), False), - ((IMAGE,), (features.BoundingBox, features.Mask), False), - ( - (IMAGE, BOUNDING_BOX, MASK), - (features.Image, features.BoundingBox, features.Mask), - True, - ), - ((), (features.Image, features.BoundingBox, features.Mask), False), - ((IMAGE, BOUNDING_BOX, MASK), (lambda obj: isinstance(obj, features.Image),), True), - ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), - ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), - ((IMAGE,), (features.Image, PIL.Image.Image, features.is_simple_tensor), True), - ((torch.Tensor(IMAGE),), (features.Image, PIL.Image.Image, features.is_simple_tensor), True), - ((to_image_pil(IMAGE),), (features.Image, PIL.Image.Image, features.is_simple_tensor), True), - ], -) -def test_has_any(sample, types, expected): - assert has_any(sample, *types) is expected - - -@pytest.mark.parametrize( - ("sample", "types", "expected"), - [ - ((IMAGE, BOUNDING_BOX, MASK), (features.Image,), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.BoundingBox,), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.Mask,), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.Image, features.BoundingBox), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.Image, features.Mask), True), - ((IMAGE, BOUNDING_BOX, MASK), (features.BoundingBox, features.Mask), True), - ( - (IMAGE, BOUNDING_BOX, MASK), - (features.Image, features.BoundingBox, features.Mask), - True, - ), - ((BOUNDING_BOX, MASK), (features.Image, features.BoundingBox), False), - ((BOUNDING_BOX, MASK), (features.Image, features.Mask), False), - ((IMAGE, MASK), (features.BoundingBox, features.Mask), False), - ( - (IMAGE, BOUNDING_BOX, MASK), - (features.Image, 
features.BoundingBox, features.Mask), - True, - ), - ((BOUNDING_BOX, MASK), (features.Image, features.BoundingBox, features.Mask), False), - ((IMAGE, MASK), (features.Image, features.BoundingBox, features.Mask), False), - ((IMAGE, BOUNDING_BOX), (features.Image, features.BoundingBox, features.Mask), False), - ( - (IMAGE, BOUNDING_BOX, MASK), - (lambda obj: isinstance(obj, (features.Image, features.BoundingBox, features.Mask)),), - True, - ), - ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), - ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), - ], -) -def test_has_all(sample, types, expected): - assert has_all(sample, *types) is expected diff --git a/torchvision/prototype/__init__.py b/torchvision/prototype/__init__.py deleted file mode 100644 index bef5ecc411d..00000000000 --- a/torchvision/prototype/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . import datasets, features, models, transforms, utils diff --git a/torchvision/prototype/datasets/__init__.py b/torchvision/prototype/datasets/__init__.py deleted file mode 100644 index 848d9135c2f..00000000000 --- a/torchvision/prototype/datasets/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -try: - import torchdata -except ModuleNotFoundError: - raise ModuleNotFoundError( - "`torchvision.prototype.datasets` depends on PyTorch's `torchdata` (https://github.com/pytorch/data). " - "You can install it with `pip install --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu" - ) from None - -from . import utils -from ._home import home - -# Load this last, since some parts depend on the above being loaded first -from ._api import list_datasets, info, load, register_info, register_dataset # usort: skip -from ._folder import from_data_folder, from_image_folder -from ._builtin import * diff --git a/torchvision/prototype/datasets/_api.py b/torchvision/prototype/datasets/_api.py deleted file mode 100644 index f6f06c60a21..00000000000 --- a/torchvision/prototype/datasets/_api.py +++ /dev/null @@ -1,65 +0,0 @@ -import pathlib -from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union - -from torchvision.prototype.datasets import home -from torchvision.prototype.datasets.utils import Dataset -from torchvision.prototype.utils._internal import add_suggestion - - -T = TypeVar("T") -D = TypeVar("D", bound=Type[Dataset]) - -BUILTIN_INFOS: Dict[str, Dict[str, Any]] = {} - - -def register_info(name: str) -> Callable[[Callable[[], Dict[str, Any]]], Callable[[], Dict[str, Any]]]: - def wrapper(fn: Callable[[], Dict[str, Any]]) -> Callable[[], Dict[str, Any]]: - BUILTIN_INFOS[name] = fn() - return fn - - return wrapper - - -BUILTIN_DATASETS = {} - - -def register_dataset(name: str) -> Callable[[D], D]: - def wrapper(dataset_cls: D) -> D: - BUILTIN_DATASETS[name] = dataset_cls - return dataset_cls - - return wrapper - - -def list_datasets() -> List[str]: - return sorted(BUILTIN_DATASETS.keys()) - - -def find(dct: Dict[str, T], name: str) -> T: - name = name.lower() - try: - return dct[name] - except KeyError as error: - raise ValueError( - add_suggestion( - f"Unknown dataset '{name}'.", - word=name, - possibilities=dct.keys(), - alternative_hint=lambda _: ( - "You can use torchvision.datasets.list_datasets() to get a list of all available datasets." 
- ), - ) - ) from error - - -def info(name: str) -> Dict[str, Any]: - return find(BUILTIN_INFOS, name) - - -def load(name: str, *, root: Optional[Union[str, pathlib.Path]] = None, **config: Any) -> Dataset: - dataset_cls = find(BUILTIN_DATASETS, name) - - if root is None: - root = pathlib.Path(home()) / name - - return dataset_cls(root, **config) diff --git a/torchvision/prototype/datasets/_builtin/README.md b/torchvision/prototype/datasets/_builtin/README.md deleted file mode 100644 index 05d61c6870e..00000000000 --- a/torchvision/prototype/datasets/_builtin/README.md +++ /dev/null @@ -1,340 +0,0 @@ -# How to add new built-in prototype datasets - -As the name implies, the datasets are still in a prototype state and thus subject to rapid change. This in turn means -that this document will also change a lot. - -If you hit a blocker while adding a dataset, please have a look at another similar dataset to see how it is implemented -there. If you can't resolve it yourself, feel free to send a draft PR in order for us to help you out. - -Finally, `from torchvision.prototype import datasets` is implied below. - -## Implementation - -Before we start with the actual implementation, you should create a module in `torchvision/prototype/datasets/_builtin` -that hints at the dataset you are going to add. For example `caltech.py` for `caltech101` and `caltech256`. In that -module create a class that inherits from `datasets.utils.Dataset` and overwrites four methods that will be discussed in -detail below: - -```python -import pathlib -from typing import Any, BinaryIO, Dict, List, Tuple, Union - -from torchdata.datapipes.iter import IterDataPipe -from torchvision.prototype.datasets.utils import Dataset, OnlineResource - -from .._api import register_dataset, register_info - -NAME = "my-dataset" - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict( - ... - ) - -@register_dataset(NAME) -class MyDataset(Dataset): - def __init__(self, root: Union[str, pathlib.Path], *, ..., skip_integrity_check: bool = False) -> None: - ... - super().__init__(root, skip_integrity_check=skip_integrity_check) - - def _resources(self) -> List[OnlineResource]: - ... - - def _datapipe(self, resource_dps: List[IterDataPipe[Tuple[str, BinaryIO]]]) -> IterDataPipe[Dict[str, Any]]: - ... - - def __len__(self) -> int: - ... -``` - -In addition to the dataset, you also need to implement an `_info()` function that takes no arguments and returns a -dictionary of static information. The most common use case is to provide human-readable categories. -[See below](#how-do-i-handle-a-dataset-that-defines-many-categories) how to handle cases with many categories. - -Finally, both the dataset class and the info function need to be registered on the API with the respective decorators. -With that they are loadable through `datasets.load("my-dataset")` and `datasets.info("my-dataset")`, respectively. - -### `__init__(self, root, *, ..., skip_integrity_check = False)` - -Constructor of the dataset that will be called when the dataset is instantiated. In addition to the parameters of the -base class, it can take arbitrary keyword-only parameters with defaults. The checking of these parameters as well as -setting them as instance attributes has to happen before the call of `super().__init__(...)`, because that will invoke -the other methods, which possibly depend on the parameters. All instance attributes must be private, i.e. prefixed with -an underscore. 
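-
-As a rough illustration, a constructor with a made-up `split` parameter (not part of the skeleton above) could look
-like this:
-
-```py
-@register_dataset(NAME)
-class MyDataset(Dataset):
-    def __init__(
-        self,
-        root: Union[str, pathlib.Path],
-        *,
-        split: str = "train",
-        skip_integrity_check: bool = False,
-    ) -> None:
-        # Check the parameter and set the private instance attribute *before* calling the base class
-        # constructor, since that call will invoke the other methods, which may depend on it.
-        if split not in ("train", "test"):
-            raise ValueError(f"Unknown split '{split}'. Valid values are 'train' and 'test'.")
-        self._split = split
-
-        super().__init__(root, skip_integrity_check=skip_integrity_check)
-```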
-
-If the implementation of the dataset depends on third-party packages, pass them as a collection of strings to the base
-class constructor, e.g. `super().__init__(..., dependencies=("scipy",))`. Their availability will be automatically
-checked if a user tries to load the dataset. Within the implementation of the dataset, import these packages lazily to
-avoid missing dependencies at import time.
-
-### `_resources(self)`
-
-Returns a `List[datasets.utils.OnlineResource]` of all the files that need to be present locally before the dataset can
-be built. The download will happen automatically.
-
-Currently, the following `OnlineResource`'s are supported:
-
-- `HttpResource`: Used for files that are directly exposed through HTTP(s) and only requires the URL.
-- `GDriveResource`: Used for files that are hosted on GDrive and requires the GDrive ID as well as the `file_name`.
-- `ManualDownloadResource`: Used for files that are not publicly accessible and requires instructions on how to
-  download them manually. If the file does not exist, an error will be raised with the supplied instructions.
-- `KaggleDownloadResource`: Used for files that are available on Kaggle. This inherits from `ManualDownloadResource`.
-
-Although optional in general, all resources used in the built-in datasets should include a
-[SHA256](https://en.wikipedia.org/wiki/SHA-2) checksum for security. It will be automatically checked after the
-download. You can compute the checksum with system utilities, e.g. `sha256sum`, or with this snippet:
-
-```python
-import hashlib
-
-def sha256sum(path, chunk_size=1024 * 1024):
-    checksum = hashlib.sha256()
-    with open(path, "rb") as f:
-        for chunk in iter(lambda: f.read(chunk_size), b""):
-            checksum.update(chunk)
-    print(checksum.hexdigest())
-```
-
-### `_datapipe(self, resource_dps)`
-
-This method is the heart of the dataset, where we transform the raw data into a usable form. A major difference compared
-to the current stable datasets is that everything is performed through `IterDataPipe`'s. From the perspective of someone
-that is working with them rather than on them, `IterDataPipe`'s behave just like generators, i.e. you can't do anything
-with them besides iterating.
-
-Of course, there are some common building blocks that should suffice in 95% of the cases. The most used are:
-
-- `Mapper`: Apply a callable to every item in the datapipe.
-- `Filter`: Keep only items that satisfy a condition.
-- `Demultiplexer`: Split a datapipe into multiple ones.
-- `IterKeyZipper`: Merge two datapipes into one.
-
-All of them can be imported `from torchdata.datapipes.iter`. In addition, use `functools.partial` in case a callable
-needs extra arguments. If the provided `IterDataPipe`'s are not sufficient for the use case, it is also not complicated
-to add one. See the MNIST or CelebA datasets for example.
-
-`_datapipe()` receives `resource_dps`, which is a list of datapipes that has a 1-to-1 correspondence with the return
-value of `_resources()`. In case of archives with regular suffixes (`.tar`, `.zip`, ...), the datapipe will contain
-tuples comprised of the path and the handle for every file in the archive. Otherwise, the datapipe will only contain one
-such tuple for the file specified by the resource.
-
-Since the datapipes are iterable in nature, some datapipes feature an in-memory buffer, e.g. `IterKeyZipper` and
-`Grouper`. There are two issues with that:
-
-1. If not used carefully, this can easily overflow the host memory, since most datasets will not fit into it completely.
-2. This can lead to unnecessarily long warm-up times when data that is only needed at runtime is buffered up front.
-
-Thus, all buffered datapipes should be used as early as possible, e.g. zipping two datapipes of file handles rather than
-trying to zip already loaded images.
-
-There are two special datapipes that are not used through their class, but through the functions `hint_shuffling` and
-`hint_sharding`. As the name implies they only hint at a location in the datapipe graph where shuffling and sharding
-should take place, but are no-ops by default. They can be imported from `torchvision.prototype.datasets.utils._internal`
-and are required in each dataset. `hint_shuffling` has to be placed before `hint_sharding`.
-
-Finally, each item in the final datapipe should be a dictionary with `str` keys. There is no standardization of the
-names (yet!).
-
-### `__len__`
-
-This returns an integer denoting the number of samples that can be drawn from the dataset. Please use
-[underscores](https://peps.python.org/pep-0515/) after every three digits starting from the right to enhance
-readability. For example, `1_281_167` vs. `1281167`.
-
-If there are only two different numbers, a simple `if` / `else` is fine:
-
-```py
-def __len__(self):
-    return 12_345 if self._split == "train" else 6_789
-```
-
-If there are more options, using a dictionary is usually the most readable option:
-
-```py
-def __len__(self):
-    return {
-        "train": 3,
-        "val": 2,
-        "test": 1,
-    }[self._split]
-```
-
-If the number of samples depends on more than one parameter, you can use tuples as dictionary keys:
-
-```py
-def __len__(self):
-    return {
-        ("train", "bar"): 4,
-        ("train", "baz"): 3,
-        ("test", "bar"): 2,
-        ("test", "baz"): 1,
-    }[(self._split, self._foo)]
-```
-
-The length of the datapipe is only an annotation for subsequent processing of the datapipe and not needed during the
-development process. Since it is an `@abstractmethod` you still have to implement it from the start. The canonical way
-is to define a dummy method like
-
-```py
-def __len__(self):
-    return 1
-```
-
-and only fill it with the correct data once the implementation is otherwise finished.
-[See below](#how-do-i-compute-the-number-of-samples) for a possible way to compute the number of samples.
-
-## Tests
-
-To test the dataset implementation, you usually don't need to add any tests, but need to provide a mock-up of the data.
-This mock-up should resemble the original data as closely as necessary, while containing only a few examples.
-
-To do this, add a new function in [`test/builtin_dataset_mocks.py`](../../../../test/builtin_dataset_mocks.py) with the
-same name as you have used in `@register_info` and `@register_dataset`. This function is called the "mock data function".
-Decorate it with `@register_mock(configs=[dict(...), ...])`. Each dictionary denotes one configuration that the dataset
-will be loaded with, e.g. `datasets.load("my-dataset", **config)`. For the most common case of a product of all options,
-you can use the `combinations_grid()` helper function, e.g.
-`configs=combinations_grid(split=("train", "test"), foo=("bar", "baz"))`.
-
-In case the name of the dataset includes hyphens `-`, replace them with underscores `_` in the function name and pass
-the `name` parameter to `@register_mock`:
-
-```py
-# this is defined in torchvision/prototype/datasets/_builtin
-@register_dataset("my-dataset")
-class MyDataset(Dataset):
-    ...
-
-@register_mock(name="my-dataset", configs=...)
-def my_dataset(root, config):
-    ...
-```
-
-The mock data function receives two arguments:
-
-- `root`: A [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html#pathlib.Path) of a folder, in which the data
-  needs to be placed.
-- `config`: The configuration to generate the data for. This is one of the dictionaries defined in
-  `@register_mock(configs=...)`.
-
-The function should generate all files that are needed for the current `config`. Each file should be complete, e.g. if
-the dataset only has a single archive that contains multiple splits, you need to generate the full archive regardless of
-the current `config`. Although this seems odd at first, it is important. Consider the following original data setup:
-
-```
-root
-├── test
-│   ├── test_image0.jpg
-│   ...
-└── train
-    ├── train_image0.jpg
-    ...
-```
-
-For map-style datasets (like the ones currently in `torchvision.datasets`), one explicitly selects the files they want to
-load. For example, something like `(root / split).iterdir()` works fine even if only the specific split folder is
-present. With iterable-style datasets though, we get something like `root.iterdir()` from `resource_dps` in
-`_datapipe()` and need to manually `Filter` it to only keep the files we want. If we only generated the data for
-the current `config`, the test would also pass if the dataset is missing the filtering, but would fail on the real data.
-
-For datasets that are ported from the old API, we already have some mock data in
-[`test/test_datasets.py`](../../../../test/test_datasets.py). You can find the corresponding test case there
-and have a look at the `inject_fake_data` function. There are a few differences though:
-
-- `tmp_dir` corresponds to `root`, but is a `str` rather than a
-  [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html#pathlib.Path). Thus, you often see something like
-  `folder = pathlib.Path(tmp_dir)`. This is not needed.
-- The data generated by `inject_fake_data` was supposed to be in an extracted state. This is no longer the case for the
-  new mock-ups. Thus, you need to use helper functions like `make_zip` or `make_tar` to actually generate the files
-  specified in the dataset.
-- As explained in the paragraph above, the generated data is often "incomplete" and only valid for the given config.
-  Make sure you follow the instructions above.
-
-The function should return an integer indicating the number of samples in the dataset for the current `config`.
-Preferably, this number should be different for different `config`'s to have more confidence in the dataset
-implementation.
-
-Finally, you can run the tests with `pytest test/test_prototype_builtin_datasets.py -k {name}`.
-
-## FAQ
-
-### How do I start?
-
-Get the skeleton of your dataset class ready with all 4 methods. For `_datapipe()`, you can just do
-`return resources_dp[0]` to get started. Then import the dataset class in
-`torchvision/prototype/datasets/_builtin/__init__.py`: this will automatically register the dataset, and it will be
-instantiable via `datasets.load("mydataset")`. In a separate script, try something like
-
-```py
-from torchvision.prototype import datasets
-
-dataset = datasets.load("mydataset")
-for sample in dataset:
-    print(sample)  # this is the content of an item in the datapipe returned by _datapipe()
-    break
-# Or you can also inspect the sample in a debugger
-```
-
-This will give you an idea of what the first datapipe in `resources_dp` contains. You can also do that with
-`resources_dp[1]` or `resources_dp[2]` (etc.) if they exist.
Then follow the instructions above to manipulate these -datapipes and return the appropriate dictionary format. - -### How do I handle a dataset that defines many categories? - -As a rule of thumb, `categories` in the info dictionary should only be set manually for ten categories or fewer. If more -categories are needed, you can add a `$NAME.categories` file to the `_builtin` folder in which each line specifies a -category. To load such a file, use the `from torchvision.prototype.datasets.utils._internal import read_categories_file` -function and pass it `$NAME`. - -In case the categories can be generated from the dataset files, e.g. the dataset follows an image folder approach where -each folder denotes the name of the category, the dataset can overwrite the `_generate_categories` method. The method -should return a sequence of strings representing the category names. In the method body, you'll have to manually load -the resources, e.g. - -```py -resources = self._resources() -dp = resources[0].load(self._root) -``` - -Note that it is not necessary here to keep a datapipe until the final step. Stick with datapipes as long as it makes -sense and afterwards materialize the data with `next(iter(dp))` or `list(dp)` and proceed with that. - -To generate the `$NAME.categories` file, run `python -m torchvision.prototype.datasets.generate_category_files $NAME`. - -### What if a resource file forms an I/O bottleneck? - -In general, we are ok with small performance hits of iterating archives rather than their extracted content. However, if -the performance hit becomes significant, the archives can still be preprocessed. `OnlineResource` accepts the -`preprocess` parameter that can be a `Callable[[pathlib.Path], pathlib.Path]` where the input points to the file to be -preprocessed and the return value should be the result of the preprocessing to load. For convenience, `preprocess` also -accepts `"decompress"` and `"extract"` to handle these common scenarios. - -### How do I compute the number of samples? - -Unless the authors of the dataset published the exact numbers (even in this case we should check), there is no other way -than to iterate over the dataset and count the number of samples: - -```py -import itertools -from torchvision.prototype import datasets - - -def combinations_grid(**kwargs): - return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] - - -# If you have implemented the mock data function for the dataset tests, you can simply copy-paste from there -configs = combinations_grid(split=("train", "test"), foo=("bar", "baz")) - -for config in configs: - dataset = datasets.load("my-dataset", **config) - - num_samples = 0 - for _ in dataset: - num_samples += 1 - - print(", ".join(f"{key}={value}" for key, value in config.items()), num_samples) -``` - -To speed this up, it is useful to temporarily comment out all unnecessary I/O, such as loading of images or annotation -files. 
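To make the `preprocess` hook from the answer above concrete, here is a minimal sketch (the URLs, the checksum placeholders, and the `repack` helper are hypothetical) of the two ways a downloaded resource can be preprocessed:

```py
import pathlib
import shutil

from torchvision.prototype.datasets.utils import HttpResource


def repack(path: pathlib.Path) -> pathlib.Path:
    # Hypothetical custom preprocessing: extract the downloaded archive into a
    # sibling folder and return the path that should be loaded afterwards.
    extracted = path.parent / "extracted"
    shutil.unpack_archive(path, extract_dir=extracted)
    return extracted


# Built-in shortcut: decompress the file right after the download.
images = HttpResource(
    "https://example.com/my-dataset-images.tar.gz",  # hypothetical URL
    sha256="<sha256 of the archive>",
    preprocess="decompress",
)

# Custom callable: full control over the preprocessing step.
annotations = HttpResource(
    "https://example.com/my-dataset-annotations.zip",  # hypothetical URL
    sha256="<sha256 of the archive>",
    preprocess=repack,
)
```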
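In the same spirit, here is a stripped-down pipeline along the lines of the `_datapipe()` section above (the folder layout, the filter, and the sample fields are hypothetical, and the helpers are written as free functions rather than dataset methods for brevity). It chains the common building blocks, places `hint_shuffling` before `hint_sharding`, and returns dictionaries with `str` keys:

```py
import pathlib
from typing import Any, BinaryIO, Dict, Tuple

from torchdata.datapipes.iter import Filter, IterDataPipe, Mapper
from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling
from torchvision.prototype.features import EncodedImage


def is_train_image(data: Tuple[str, Any]) -> bool:
    # Hypothetical filter: keep only files that live inside a "train" folder.
    return pathlib.Path(data[0]).parent.name == "train"


def prepare_sample(data: Tuple[str, BinaryIO]) -> Dict[str, Any]:
    # The final samples are plain dictionaries with str keys.
    path, buffer = data
    return dict(path=path, image=EncodedImage.from_file(buffer))


def build_datapipe(resource_dp: IterDataPipe[Tuple[str, BinaryIO]]) -> IterDataPipe[Dict[str, Any]]:
    dp = Filter(resource_dp, is_train_image)
    dp = hint_shuffling(dp)  # the shuffling hint has to come first ...
    dp = hint_sharding(dp)  # ... followed by the sharding hint
    return Mapper(dp, prepare_sample)
```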
diff --git a/torchvision/prototype/datasets/_builtin/__init__.py b/torchvision/prototype/datasets/_builtin/__init__.py deleted file mode 100644 index d84e9af9fc4..00000000000 --- a/torchvision/prototype/datasets/_builtin/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -from .caltech import Caltech101, Caltech256 -from .celeba import CelebA -from .cifar import Cifar10, Cifar100 -from .clevr import CLEVR -from .coco import Coco -from .country211 import Country211 -from .cub200 import CUB200 -from .dtd import DTD -from .eurosat import EuroSAT -from .fer2013 import FER2013 -from .food101 import Food101 -from .gtsrb import GTSRB -from .imagenet import ImageNet -from .mnist import EMNIST, FashionMNIST, KMNIST, MNIST, QMNIST -from .oxford_iiit_pet import OxfordIIITPet -from .pcam import PCAM -from .sbd import SBD -from .semeion import SEMEION -from .stanford_cars import StanfordCars -from .svhn import SVHN -from .usps import USPS -from .voc import VOC diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py deleted file mode 100644 index a00bf2e2cc9..00000000000 --- a/torchvision/prototype/datasets/_builtin/caltech.py +++ /dev/null @@ -1,207 +0,0 @@ -import pathlib -import re -from typing import Any, BinaryIO, Dict, List, Tuple, Union - -import numpy as np -from torchdata.datapipes.iter import Filter, IterDataPipe, IterKeyZipper, Mapper -from torchvision.prototype.datasets.utils import Dataset, GDriveResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - read_categories_file, - read_mat, -) -from torchvision.prototype.features import _Feature, BoundingBox, EncodedImage, Label - -from .._api import register_dataset, register_info - - -@register_info("caltech101") -def _caltech101_info() -> Dict[str, Any]: - return dict(categories=read_categories_file("caltech101")) - - -@register_dataset("caltech101") -class Caltech101(Dataset): - """ - - **homepage**: https://data.caltech.edu/records/20086 - - **dependencies**: - - _ - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - skip_integrity_check: bool = False, - ) -> None: - self._categories = _caltech101_info()["categories"] - - super().__init__( - root, - dependencies=("scipy",), - skip_integrity_check=skip_integrity_check, - ) - - def _resources(self) -> List[OnlineResource]: - images = GDriveResource( - "137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp", - file_name="101_ObjectCategories.tar.gz", - sha256="af6ece2f339791ca20f855943d8b55dd60892c0a25105fcd631ee3d6430f9926", - preprocess="decompress", - ) - anns = GDriveResource( - "175kQy3UsZ0wUEHZjqkUDdNVssr7bgh_m", - file_name="Annotations.tar", - sha256="1717f4e10aa837b05956e3f4c94456527b143eec0d95e935028b30aff40663d8", - ) - return [images, anns] - - _IMAGES_NAME_PATTERN = re.compile(r"image_(?P\d+)[.]jpg") - _ANNS_NAME_PATTERN = re.compile(r"annotation_(?P\d+)[.]mat") - _ANNS_CATEGORY_MAP = { - "Faces_2": "Faces", - "Faces_3": "Faces_easy", - "Motorbikes_16": "Motorbikes", - "Airplanes_Side_2": "airplanes", - } - - def _is_not_background_image(self, data: Tuple[str, Any]) -> bool: - path = pathlib.Path(data[0]) - return path.parent.name != "BACKGROUND_Google" - - def _is_ann(self, data: Tuple[str, Any]) -> bool: - path = pathlib.Path(data[0]) - return bool(self._ANNS_NAME_PATTERN.match(path.name)) - - def _images_key_fn(self, data: Tuple[str, Any]) -> Tuple[str, str]: - path = pathlib.Path(data[0]) - - category = path.parent.name - id = 
self._IMAGES_NAME_PATTERN.match(path.name).group("id") # type: ignore[union-attr] - - return category, id - - def _anns_key_fn(self, data: Tuple[str, Any]) -> Tuple[str, str]: - path = pathlib.Path(data[0]) - - category = path.parent.name - if category in self._ANNS_CATEGORY_MAP: - category = self._ANNS_CATEGORY_MAP[category] - - id = self._ANNS_NAME_PATTERN.match(path.name).group("id") # type: ignore[union-attr] - - return category, id - - def _prepare_sample( - self, data: Tuple[Tuple[str, str], Tuple[Tuple[str, BinaryIO], Tuple[str, BinaryIO]]] - ) -> Dict[str, Any]: - key, (image_data, ann_data) = data - category, _ = key - image_path, image_buffer = image_data - ann_path, ann_buffer = ann_data - - image = EncodedImage.from_file(image_buffer) - ann = read_mat(ann_buffer) - - return dict( - label=Label.from_category(category, categories=self._categories), - image_path=image_path, - image=image, - ann_path=ann_path, - bounding_box=BoundingBox( - ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], format="xyxy", image_size=image.image_size - ), - contour=_Feature(ann["obj_contour"].T), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - images_dp, anns_dp = resource_dps - - images_dp = Filter(images_dp, self._is_not_background_image) - images_dp = hint_shuffling(images_dp) - images_dp = hint_sharding(images_dp) - - anns_dp = Filter(anns_dp, self._is_ann) - - dp = IterKeyZipper( - images_dp, - anns_dp, - key_fn=self._images_key_fn, - ref_key_fn=self._anns_key_fn, - buffer_size=INFINITE_BUFFER_SIZE, - keep_key=True, - ) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return 8677 - - def _generate_categories(self) -> List[str]: - resources = self._resources() - - dp = resources[0].load(self._root) - dp = Filter(dp, self._is_not_background_image) - - return sorted({pathlib.Path(path).parent.name for path, _ in dp}) - - -@register_info("caltech256") -def _caltech256_info() -> Dict[str, Any]: - return dict(categories=read_categories_file("caltech256")) - - -@register_dataset("caltech256") -class Caltech256(Dataset): - """ - - **homepage**: https://data.caltech.edu/records/20087 - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - skip_integrity_check: bool = False, - ) -> None: - self._categories = _caltech256_info()["categories"] - - super().__init__(root, skip_integrity_check=skip_integrity_check) - - def _resources(self) -> List[OnlineResource]: - return [ - GDriveResource( - "1r6o0pSROcV1_VwT4oSjA2FBUSCWGuxLK", - file_name="256_ObjectCategories.tar", - sha256="08ff01b03c65566014ae88eb0490dbe4419fc7ac4de726ee1163e39fd809543e", - ) - ] - - def _is_not_rogue_file(self, data: Tuple[str, Any]) -> bool: - path = pathlib.Path(data[0]) - return path.name != "RENAME2" - - def _prepare_sample(self, data: Tuple[str, BinaryIO]) -> Dict[str, Any]: - path, buffer = data - - return dict( - path=path, - image=EncodedImage.from_file(buffer), - label=Label(int(pathlib.Path(path).parent.name.split(".", 1)[0]) - 1, categories=self._categories), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - dp = resource_dps[0] - dp = Filter(dp, self._is_not_rogue_file) - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return 30607 - - def _generate_categories(self) -> List[str]: - resources = self._resources() - - dp = resources[0].load(self._root) - dir_names = {pathlib.Path(path).parent.name for path, _ in dp} 
- - return [name.split(".")[1] for name in sorted(dir_names)] diff --git a/torchvision/prototype/datasets/_builtin/caltech101.categories b/torchvision/prototype/datasets/_builtin/caltech101.categories deleted file mode 100644 index d5c18654b4e..00000000000 --- a/torchvision/prototype/datasets/_builtin/caltech101.categories +++ /dev/null @@ -1,101 +0,0 @@ -Faces -Faces_easy -Leopards -Motorbikes -accordion -airplanes -anchor -ant -barrel -bass -beaver -binocular -bonsai -brain -brontosaurus -buddha -butterfly -camera -cannon -car_side -ceiling_fan -cellphone -chair -chandelier -cougar_body -cougar_face -crab -crayfish -crocodile -crocodile_head -cup -dalmatian -dollar_bill -dolphin -dragonfly -electric_guitar -elephant -emu -euphonium -ewer -ferry -flamingo -flamingo_head -garfield -gerenuk -gramophone -grand_piano -hawksbill -headphone -hedgehog -helicopter -ibis -inline_skate -joshua_tree -kangaroo -ketch -lamp -laptop -llama -lobster -lotus -mandolin -mayfly -menorah -metronome -minaret -nautilus -octopus -okapi -pagoda -panda -pigeon -pizza -platypus -pyramid -revolver -rhino -rooster -saxophone -schooner -scissors -scorpion -sea_horse -snoopy -soccer_ball -stapler -starfish -stegosaurus -stop_sign -strawberry -sunflower -tick -trilobite -umbrella -watch -water_lilly -wheelchair -wild_cat -windsor_chair -wrench -yin_yang diff --git a/torchvision/prototype/datasets/_builtin/caltech256.categories b/torchvision/prototype/datasets/_builtin/caltech256.categories deleted file mode 100644 index 82128efba97..00000000000 --- a/torchvision/prototype/datasets/_builtin/caltech256.categories +++ /dev/null @@ -1,257 +0,0 @@ -ak47 -american-flag -backpack -baseball-bat -baseball-glove -basketball-hoop -bat -bathtub -bear -beer-mug -billiards -binoculars -birdbath -blimp -bonsai-101 -boom-box -bowling-ball -bowling-pin -boxing-glove -brain-101 -breadmaker -buddha-101 -bulldozer -butterfly -cactus -cake -calculator -camel -cannon -canoe -car-tire -cartman -cd -centipede -cereal-box -chandelier-101 -chess-board -chimp -chopsticks -cockroach -coffee-mug -coffin -coin -comet -computer-keyboard -computer-monitor -computer-mouse -conch -cormorant -covered-wagon -cowboy-hat -crab-101 -desk-globe -diamond-ring -dice -dog -dolphin-101 -doorknob -drinking-straw -duck -dumb-bell -eiffel-tower -electric-guitar-101 -elephant-101 -elk -ewer-101 -eyeglasses -fern -fighter-jet -fire-extinguisher -fire-hydrant -fire-truck -fireworks -flashlight -floppy-disk -football-helmet -french-horn -fried-egg -frisbee -frog -frying-pan -galaxy -gas-pump -giraffe -goat -golden-gate-bridge -goldfish -golf-ball -goose -gorilla -grand-piano-101 -grapes -grasshopper -guitar-pick -hamburger -hammock -harmonica -harp -harpsichord -hawksbill-101 -head-phones -helicopter-101 -hibiscus -homer-simpson -horse -horseshoe-crab -hot-air-balloon -hot-dog -hot-tub -hourglass -house-fly -human-skeleton -hummingbird -ibis-101 -ice-cream-cone -iguana -ipod -iris -jesus-christ -joy-stick -kangaroo-101 -kayak -ketch-101 -killer-whale -knife -ladder -laptop-101 -lathe -leopards-101 -license-plate -lightbulb -light-house -lightning -llama-101 -mailbox -mandolin -mars -mattress -megaphone -menorah-101 -microscope -microwave -minaret -minotaur -motorbikes-101 -mountain-bike -mushroom -mussels -necktie -octopus -ostrich -owl -palm-pilot -palm-tree -paperclip -paper-shredder -pci-card -penguin -people -pez-dispenser -photocopier -picnic-table -playing-card -porcupine -pram -praying-mantis -pyramid -raccoon -radio-telescope -rainbow -refrigerator 
-revolver-101 -rifle -rotary-phone -roulette-wheel -saddle -saturn -school-bus -scorpion-101 -screwdriver -segway -self-propelled-lawn-mower -sextant -sheet-music -skateboard -skunk -skyscraper -smokestack -snail -snake -sneaker -snowmobile -soccer-ball -socks -soda-can -spaghetti -speed-boat -spider -spoon -stained-glass -starfish-101 -steering-wheel -stirrups -sunflower-101 -superman -sushi -swan -swiss-army-knife -sword -syringe -tambourine -teapot -teddy-bear -teepee -telephone-box -tennis-ball -tennis-court -tennis-racket -theodolite -toaster -tomato -tombstone -top-hat -touring-bike -tower-pisa -traffic-light -treadmill -triceratops -tricycle -trilobite-101 -tripod -t-shirt -tuning-fork -tweezer -umbrella-101 -unicorn -vcr -video-projector -washing-machine -watch-101 -waterfall -watermelon -welding-mask -wheelbarrow -windmill -wine-bottle -xylophone -yarmulke -yo-yo -zebra -airplanes-101 -car-side-101 -faces-easy-101 -greyhound -tennis-shoes -toad -clutter diff --git a/torchvision/prototype/datasets/_builtin/celeba.py b/torchvision/prototype/datasets/_builtin/celeba.py deleted file mode 100644 index e42657e826e..00000000000 --- a/torchvision/prototype/datasets/_builtin/celeba.py +++ /dev/null @@ -1,195 +0,0 @@ -import csv -import pathlib -from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tuple, Union - -from torchdata.datapipes.iter import Filter, IterDataPipe, IterKeyZipper, Mapper, Zipper -from torchvision.prototype.datasets.utils import Dataset, GDriveResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - getitem, - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - path_accessor, -) -from torchvision.prototype.features import _Feature, BoundingBox, EncodedImage, Label - -from .._api import register_dataset, register_info - -csv.register_dialect("celeba", delimiter=" ", skipinitialspace=True) - - -class CelebACSVParser(IterDataPipe[Tuple[str, Dict[str, str]]]): - def __init__( - self, - datapipe: IterDataPipe[Tuple[Any, BinaryIO]], - *, - fieldnames: Optional[Sequence[str]] = None, - ) -> None: - self.datapipe = datapipe - self.fieldnames = fieldnames - - def __iter__(self) -> Iterator[Tuple[str, Dict[str, str]]]: - for _, file in self.datapipe: - file = (line.decode() for line in file) - - if self.fieldnames: - fieldnames = self.fieldnames - else: - # The first row is skipped, because it only contains the number of samples - next(file) - - # Empty field names are filtered out, because some files have an extra white space after the header - # line, which is recognized as extra column - fieldnames = [name for name in next(csv.reader([next(file)], dialect="celeba")) if name] - # Some files do not include a label for the image ID column - if fieldnames[0] != "image_id": - fieldnames.insert(0, "image_id") - - for line in csv.DictReader(file, fieldnames=fieldnames, dialect="celeba"): - yield line.pop("image_id"), line - - -NAME = "celeba" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict() - - -@register_dataset(NAME) -class CelebA(Dataset): - """ - - **homepage**: https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "val", "test")) - - super().__init__(root, skip_integrity_check=skip_integrity_check) - - def _resources(self) -> List[OnlineResource]: - splits = GDriveResource( - 
"0B7EVK8r0v71pY0NSMzRuSXJEVkk", - sha256="fc955bcb3ef8fbdf7d5640d9a8693a8431b5f2ee291a5c1449a1549e7e073fe7", - file_name="list_eval_partition.txt", - ) - images = GDriveResource( - "0B7EVK8r0v71pZjFTYXZWM3FlRnM", - sha256="46fb89443c578308acf364d7d379fe1b9efb793042c0af734b6112e4fd3a8c74", - file_name="img_align_celeba.zip", - ) - identities = GDriveResource( - "1_ee_0u7vcNLOfNLegJRHmolfH5ICW-XS", - sha256="c6143857c3e2630ac2da9f782e9c1232e5e59be993a9d44e8a7916c78a6158c0", - file_name="identity_CelebA.txt", - ) - attributes = GDriveResource( - "0B7EVK8r0v71pblRyaVFSWGxPY0U", - sha256="f0e5da289d5ccf75ffe8811132694922b60f2af59256ed362afa03fefba324d0", - file_name="list_attr_celeba.txt", - ) - bounding_boxes = GDriveResource( - "0B7EVK8r0v71pbThiMVRxWXZ4dU0", - sha256="7487a82e57c4bb956c5445ae2df4a91ffa717e903c5fa22874ede0820c8ec41b", - file_name="list_bbox_celeba.txt", - ) - landmarks = GDriveResource( - "0B7EVK8r0v71pd0FJY3Blby1HUTQ", - sha256="6c02a87569907f6db2ba99019085697596730e8129f67a3d61659f198c48d43b", - file_name="list_landmarks_align_celeba.txt", - ) - return [splits, images, identities, attributes, bounding_boxes, landmarks] - - def _filter_split(self, data: Tuple[str, Dict[str, str]]) -> bool: - split_id = { - "train": "0", - "val": "1", - "test": "2", - }[self._split] - return data[1]["split_id"] == split_id - - def _prepare_sample( - self, - data: Tuple[ - Tuple[str, Tuple[Tuple[str, List[str]], Tuple[str, BinaryIO]]], - Tuple[ - Tuple[str, Dict[str, str]], - Tuple[str, Dict[str, str]], - Tuple[str, Dict[str, str]], - Tuple[str, Dict[str, str]], - ], - ], - ) -> Dict[str, Any]: - split_and_image_data, ann_data = data - _, (_, image_data) = split_and_image_data - path, buffer = image_data - - image = EncodedImage.from_file(buffer) - (_, identity), (_, attributes), (_, bounding_box), (_, landmarks) = ann_data - - return dict( - path=path, - image=image, - identity=Label(int(identity["identity"])), - attributes={attr: value == "1" for attr, value in attributes.items()}, - bounding_box=BoundingBox( - [int(bounding_box[key]) for key in ("x_1", "y_1", "width", "height")], - format="xywh", - image_size=image.image_size, - ), - landmarks={ - landmark: _Feature((int(landmarks[f"{landmark}_x"]), int(landmarks[f"{landmark}_y"]))) - for landmark in {key[:-2] for key in landmarks.keys()} - }, - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - splits_dp, images_dp, identities_dp, attributes_dp, bounding_boxes_dp, landmarks_dp = resource_dps - - splits_dp = CelebACSVParser(splits_dp, fieldnames=("image_id", "split_id")) - splits_dp = Filter(splits_dp, self._filter_split) - splits_dp = hint_shuffling(splits_dp) - splits_dp = hint_sharding(splits_dp) - - anns_dp = Zipper( - *[ - CelebACSVParser(dp, fieldnames=fieldnames) - for dp, fieldnames in ( - (identities_dp, ("image_id", "identity")), - (attributes_dp, None), - (bounding_boxes_dp, None), - (landmarks_dp, None), - ) - ] - ) - - dp = IterKeyZipper( - splits_dp, - images_dp, - key_fn=getitem(0), - ref_key_fn=path_accessor("name"), - buffer_size=INFINITE_BUFFER_SIZE, - keep_key=True, - ) - dp = IterKeyZipper( - dp, - anns_dp, - key_fn=getitem(0), - ref_key_fn=getitem(0, 0), - buffer_size=INFINITE_BUFFER_SIZE, - ) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return { - "train": 162_770, - "val": 19_867, - "test": 19_962, - }[self._split] diff --git a/torchvision/prototype/datasets/_builtin/cifar.py b/torchvision/prototype/datasets/_builtin/cifar.py deleted file mode 
100644 index 26196ded638..00000000000 --- a/torchvision/prototype/datasets/_builtin/cifar.py +++ /dev/null @@ -1,139 +0,0 @@ -import abc -import io -import pathlib -import pickle -from typing import Any, BinaryIO, cast, Dict, Iterator, List, Optional, Tuple, Union - -import numpy as np -from torchdata.datapipes.iter import Filter, IterDataPipe, Mapper -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - hint_sharding, - hint_shuffling, - path_comparator, - read_categories_file, -) -from torchvision.prototype.features import Image, Label - -from .._api import register_dataset, register_info - - -class CifarFileReader(IterDataPipe[Tuple[np.ndarray, int]]): - def __init__(self, datapipe: IterDataPipe[Dict[str, Any]], *, labels_key: str) -> None: - self.datapipe = datapipe - self.labels_key = labels_key - - def __iter__(self) -> Iterator[Tuple[np.ndarray, int]]: - for mapping in self.datapipe: - image_arrays = mapping["data"].reshape((-1, 3, 32, 32)) - category_idcs = mapping[self.labels_key] - yield from iter(zip(image_arrays, category_idcs)) - - -class _CifarBase(Dataset): - _FILE_NAME: str - _SHA256: str - _LABELS_KEY: str - _META_FILE_NAME: str - _CATEGORIES_KEY: str - _categories: List[str] - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "test")) - super().__init__(root, skip_integrity_check=skip_integrity_check) - - @abc.abstractmethod - def _is_data_file(self, data: Tuple[str, BinaryIO]) -> Optional[int]: - pass - - def _resources(self) -> List[OnlineResource]: - return [ - HttpResource( - f"https://www.cs.toronto.edu/~kriz/{self._FILE_NAME}", - sha256=self._SHA256, - ) - ] - - def _unpickle(self, data: Tuple[str, io.BytesIO]) -> Dict[str, Any]: - _, file = data - return cast(Dict[str, Any], pickle.load(file, encoding="latin1")) - - def _prepare_sample(self, data: Tuple[np.ndarray, int]) -> Dict[str, Any]: - image_array, category_idx = data - return dict( - image=Image(image_array), - label=Label(category_idx, categories=self._categories), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - dp = resource_dps[0] - dp = Filter(dp, self._is_data_file) - dp = Mapper(dp, self._unpickle) - dp = CifarFileReader(dp, labels_key=self._LABELS_KEY) - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return 50_000 if self._split == "train" else 10_000 - - def _generate_categories(self) -> List[str]: - resources = self._resources() - - dp = resources[0].load(self._root) - dp = Filter(dp, path_comparator("name", self._META_FILE_NAME)) - dp = Mapper(dp, self._unpickle) - - return cast(List[str], next(iter(dp))[self._CATEGORIES_KEY]) - - -@register_info("cifar10") -def _cifar10_info() -> Dict[str, Any]: - return dict(categories=read_categories_file("cifar10")) - - -@register_dataset("cifar10") -class Cifar10(_CifarBase): - """ - - **homepage**: https://www.cs.toronto.edu/~kriz/cifar.html - """ - - _FILE_NAME = "cifar-10-python.tar.gz" - _SHA256 = "6d958be074577803d12ecdefd02955f39262c83c16fe9348329d7fe0b5c001ce" - _LABELS_KEY = "labels" - _META_FILE_NAME = "batches.meta" - _CATEGORIES_KEY = "label_names" - _categories = _cifar10_info()["categories"] - - def _is_data_file(self, data: Tuple[str, Any]) -> bool: - path = pathlib.Path(data[0]) 
- return path.name.startswith("data" if self._split == "train" else "test") - - -@register_info("cifar100") -def _cifar100_info() -> Dict[str, Any]: - return dict(categories=read_categories_file("cifar100")) - - -@register_dataset("cifar100") -class Cifar100(_CifarBase): - """ - - **homepage**: https://www.cs.toronto.edu/~kriz/cifar.html - """ - - _FILE_NAME = "cifar-100-python.tar.gz" - _SHA256 = "85cd44d02ba6437773c5bbd22e183051d648de2e7d6b014e1ef29b855ba677a7" - _LABELS_KEY = "fine_labels" - _META_FILE_NAME = "meta" - _CATEGORIES_KEY = "fine_label_names" - _categories = _cifar100_info()["categories"] - - def _is_data_file(self, data: Tuple[str, Any]) -> bool: - path = pathlib.Path(data[0]) - return path.name == self._split diff --git a/torchvision/prototype/datasets/_builtin/cifar10.categories b/torchvision/prototype/datasets/_builtin/cifar10.categories deleted file mode 100644 index fa30c22b95d..00000000000 --- a/torchvision/prototype/datasets/_builtin/cifar10.categories +++ /dev/null @@ -1,10 +0,0 @@ -airplane -automobile -bird -cat -deer -dog -frog -horse -ship -truck diff --git a/torchvision/prototype/datasets/_builtin/cifar100.categories b/torchvision/prototype/datasets/_builtin/cifar100.categories deleted file mode 100644 index 7f7bf51d1ab..00000000000 --- a/torchvision/prototype/datasets/_builtin/cifar100.categories +++ /dev/null @@ -1,100 +0,0 @@ -apple -aquarium_fish -baby -bear -beaver -bed -bee -beetle -bicycle -bottle -bowl -boy -bridge -bus -butterfly -camel -can -castle -caterpillar -cattle -chair -chimpanzee -clock -cloud -cockroach -couch -crab -crocodile -cup -dinosaur -dolphin -elephant -flatfish -forest -fox -girl -hamster -house -kangaroo -keyboard -lamp -lawn_mower -leopard -lion -lizard -lobster -man -maple_tree -motorcycle -mountain -mouse -mushroom -oak_tree -orange -orchid -otter -palm_tree -pear -pickup_truck -pine_tree -plain -plate -poppy -porcupine -possum -rabbit -raccoon -ray -road -rocket -rose -sea -seal -shark -shrew -skunk -skyscraper -snail -snake -spider -squirrel -streetcar -sunflower -sweet_pepper -table -tank -telephone -television -tiger -tractor -train -trout -tulip -turtle -wardrobe -whale -willow_tree -wolf -woman -worm diff --git a/torchvision/prototype/datasets/_builtin/clevr.py b/torchvision/prototype/datasets/_builtin/clevr.py deleted file mode 100644 index 4ddacdfb982..00000000000 --- a/torchvision/prototype/datasets/_builtin/clevr.py +++ /dev/null @@ -1,105 +0,0 @@ -import pathlib -from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union - -from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, JsonParser, Mapper, UnBatcher -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - getitem, - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - path_accessor, - path_comparator, -) -from torchvision.prototype.features import EncodedImage, Label - -from .._api import register_dataset, register_info - -NAME = "clevr" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict() - - -@register_dataset(NAME) -class CLEVR(Dataset): - """ - - **homepage**: https://cs.stanford.edu/people/jcjohns/clevr/ - """ - - def __init__( - self, root: Union[str, pathlib.Path], *, split: str = "train", skip_integrity_check: bool = False - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "val", "test")) - - super().__init__(root, skip_integrity_check=skip_integrity_check) - - def 
_resources(self) -> List[OnlineResource]: - archive = HttpResource( - "https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip", - sha256="5cd61cf1096ed20944df93c9adb31e74d189b8459a94f54ba00090e5c59936d1", - ) - return [archive] - - def _classify_archive(self, data: Tuple[str, Any]) -> Optional[int]: - path = pathlib.Path(data[0]) - if path.parents[1].name == "images": - return 0 - elif path.parent.name == "scenes": - return 1 - else: - return None - - def _filter_scene_anns(self, data: Tuple[str, Any]) -> bool: - key, _ = data - return key == "scenes" - - def _add_empty_anns(self, data: Tuple[str, BinaryIO]) -> Tuple[Tuple[str, BinaryIO], None]: - return data, None - - def _prepare_sample(self, data: Tuple[Tuple[str, BinaryIO], Optional[Dict[str, Any]]]) -> Dict[str, Any]: - image_data, scenes_data = data - path, buffer = image_data - - return dict( - path=path, - image=EncodedImage.from_file(buffer), - label=Label(len(scenes_data["objects"])) if scenes_data else None, - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - archive_dp = resource_dps[0] - images_dp, scenes_dp = Demultiplexer( - archive_dp, - 2, - self._classify_archive, - drop_none=True, - buffer_size=INFINITE_BUFFER_SIZE, - ) - - images_dp = Filter(images_dp, path_comparator("parent.name", self._split)) - images_dp = hint_shuffling(images_dp) - images_dp = hint_sharding(images_dp) - - if self._split != "test": - scenes_dp = Filter(scenes_dp, path_comparator("name", f"CLEVR_{self._split}_scenes.json")) - scenes_dp = JsonParser(scenes_dp) - scenes_dp = Mapper(scenes_dp, getitem(1, "scenes")) - scenes_dp = UnBatcher(scenes_dp) - - dp = IterKeyZipper( - images_dp, - scenes_dp, - key_fn=path_accessor("name"), - ref_key_fn=getitem("image_filename"), - buffer_size=INFINITE_BUFFER_SIZE, - ) - else: - dp = Mapper(images_dp, self._add_empty_anns) - - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return 70_000 if self._split == "train" else 15_000 diff --git a/torchvision/prototype/datasets/_builtin/coco.categories b/torchvision/prototype/datasets/_builtin/coco.categories deleted file mode 100644 index 27e612f6d7d..00000000000 --- a/torchvision/prototype/datasets/_builtin/coco.categories +++ /dev/null @@ -1,91 +0,0 @@ -__background__,N/A -person,person -bicycle,vehicle -car,vehicle -motorcycle,vehicle -airplane,vehicle -bus,vehicle -train,vehicle -truck,vehicle -boat,vehicle -traffic light,outdoor -fire hydrant,outdoor -N/A,N/A -stop sign,outdoor -parking meter,outdoor -bench,outdoor -bird,animal -cat,animal -dog,animal -horse,animal -sheep,animal -cow,animal -elephant,animal -bear,animal -zebra,animal -giraffe,animal -N/A,N/A -backpack,accessory -umbrella,accessory -N/A,N/A -N/A,N/A -handbag,accessory -tie,accessory -suitcase,accessory -frisbee,sports -skis,sports -snowboard,sports -sports ball,sports -kite,sports -baseball bat,sports -baseball glove,sports -skateboard,sports -surfboard,sports -tennis racket,sports -bottle,kitchen -N/A,N/A -wine glass,kitchen -cup,kitchen -fork,kitchen -knife,kitchen -spoon,kitchen -bowl,kitchen -banana,food -apple,food -sandwich,food -orange,food -broccoli,food -carrot,food -hot dog,food -pizza,food -donut,food -cake,food -chair,furniture -couch,furniture -potted plant,furniture -bed,furniture -N/A,N/A -dining table,furniture -N/A,N/A -N/A,N/A -toilet,furniture -N/A,N/A -tv,electronic -laptop,electronic -mouse,electronic -remote,electronic -keyboard,electronic -cell phone,electronic -microwave,appliance -oven,appliance 
-toaster,appliance -sink,appliance -refrigerator,appliance -N/A,N/A -book,indoor -clock,indoor -vase,indoor -scissors,indoor -teddy bear,indoor -hair drier,indoor -toothbrush,indoor diff --git a/torchvision/prototype/datasets/_builtin/coco.py b/torchvision/prototype/datasets/_builtin/coco.py deleted file mode 100644 index 16a16998bf7..00000000000 --- a/torchvision/prototype/datasets/_builtin/coco.py +++ /dev/null @@ -1,270 +0,0 @@ -import pathlib -import re -from collections import defaultdict, OrderedDict -from typing import Any, BinaryIO, cast, Dict, List, Optional, Tuple, Union - -import torch -from torchdata.datapipes.iter import ( - Demultiplexer, - Filter, - Grouper, - IterDataPipe, - IterKeyZipper, - JsonParser, - Mapper, - UnBatcher, -) -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - getitem, - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - MappingIterator, - path_accessor, - read_categories_file, -) -from torchvision.prototype.features import _Feature, BoundingBox, EncodedImage, Label - -from .._api import register_dataset, register_info - - -NAME = "coco" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - categories, super_categories = zip(*read_categories_file(NAME)) - return dict(categories=categories, super_categories=super_categories) - - -@register_dataset(NAME) -class Coco(Dataset): - """ - - **homepage**: https://cocodataset.org/ - - **dependencies**: - - _ - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - year: str = "2017", - annotations: Optional[str] = "instances", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", {"train", "val"}) - self._year = self._verify_str_arg(year, "year", {"2017", "2014"}) - self._annotations = ( - self._verify_str_arg(annotations, "annotations", self._ANN_DECODERS.keys()) - if annotations is not None - else None - ) - - info = _info() - categories, super_categories = info["categories"], info["super_categories"] - self._categories = categories - self._category_to_super_category = dict(zip(categories, super_categories)) - - super().__init__(root, dependencies=("pycocotools",), skip_integrity_check=skip_integrity_check) - - _IMAGE_URL_BASE = "http://images.cocodataset.org/zips" - - _IMAGES_CHECKSUMS = { - ("2014", "train"): "ede4087e640bddba550e090eae701092534b554b42b05ac33f0300b984b31775", - ("2014", "val"): "fe9be816052049c34717e077d9e34aa60814a55679f804cd043e3cbee3b9fde0", - ("2017", "train"): "69a8bb58ea5f8f99d24875f21416de2e9ded3178e903f1f7603e283b9e06d929", - ("2017", "val"): "4f7e2ccb2866ec5041993c9cf2a952bbed69647b115d0f74da7ce8f4bef82f05", - } - - _META_URL_BASE = "http://images.cocodataset.org/annotations" - - _META_CHECKSUMS = { - "2014": "031296bbc80c45a1d1f76bf9a90ead27e94e99ec629208449507a4917a3bf009", - "2017": "113a836d90195ee1f884e704da6304dfaaecff1f023f49b6ca93c4aaae470268", - } - - def _resources(self) -> List[OnlineResource]: - images = HttpResource( - f"{self._IMAGE_URL_BASE}/{self._split}{self._year}.zip", - sha256=self._IMAGES_CHECKSUMS[(self._year, self._split)], - ) - meta = HttpResource( - f"{self._META_URL_BASE}/annotations_trainval{self._year}.zip", - sha256=self._META_CHECKSUMS[self._year], - ) - return [images, meta] - - def _segmentation_to_mask(self, segmentation: Any, *, is_crowd: bool, image_size: Tuple[int, int]) -> torch.Tensor: - from pycocotools import mask - - if is_crowd: - 
segmentation = mask.frPyObjects(segmentation, *image_size) - else: - segmentation = mask.merge(mask.frPyObjects(segmentation, *image_size)) - - return torch.from_numpy(mask.decode(segmentation)).to(torch.bool) - - def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[str, Any]) -> Dict[str, Any]: - image_size = (image_meta["height"], image_meta["width"]) - labels = [ann["category_id"] for ann in anns] - return dict( - # TODO: create a segmentation feature - segmentations=_Feature( - torch.stack( - [ - self._segmentation_to_mask(ann["segmentation"], is_crowd=ann["iscrowd"], image_size=image_size) - for ann in anns - ] - ) - ), - areas=_Feature([ann["area"] for ann in anns]), - crowds=_Feature([ann["iscrowd"] for ann in anns], dtype=torch.bool), - bounding_boxes=BoundingBox( - [ann["bbox"] for ann in anns], - format="xywh", - image_size=image_size, - ), - labels=Label(labels, categories=self._categories), - super_categories=[self._category_to_super_category[self._categories[label]] for label in labels], - ann_ids=[ann["id"] for ann in anns], - ) - - def _decode_captions_ann(self, anns: List[Dict[str, Any]], image_meta: Dict[str, Any]) -> Dict[str, Any]: - return dict( - captions=[ann["caption"] for ann in anns], - ann_ids=[ann["id"] for ann in anns], - ) - - _ANN_DECODERS = OrderedDict( - [ - ("instances", _decode_instances_anns), - ("captions", _decode_captions_ann), - ] - ) - - _META_FILE_PATTERN = re.compile( - rf"(?P({'|'.join(_ANN_DECODERS.keys())}))_(?P[a-zA-Z]+)(?P\d+)[.]json" - ) - - def _filter_meta_files(self, data: Tuple[str, Any]) -> bool: - match = self._META_FILE_PATTERN.match(pathlib.Path(data[0]).name) - return bool( - match - and match["split"] == self._split - and match["year"] == self._year - and match["annotations"] == self._annotations - ) - - def _classify_meta(self, data: Tuple[str, Any]) -> Optional[int]: - key, _ = data - if key == "images": - return 0 - elif key == "annotations": - return 1 - else: - return None - - def _prepare_image(self, data: Tuple[str, BinaryIO]) -> Dict[str, Any]: - path, buffer = data - return dict( - path=path, - image=EncodedImage.from_file(buffer), - ) - - def _prepare_sample( - self, - data: Tuple[Tuple[List[Dict[str, Any]], Dict[str, Any]], Tuple[str, BinaryIO]], - ) -> Dict[str, Any]: - ann_data, image_data = data - anns, image_meta = ann_data - - sample = self._prepare_image(image_data) - # this method is only called if we have annotations - annotations = cast(str, self._annotations) - sample.update(self._ANN_DECODERS[annotations](self, anns, image_meta)) - return sample - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - images_dp, meta_dp = resource_dps - - if self._annotations is None: - dp = hint_shuffling(images_dp) - dp = hint_sharding(dp) - dp = hint_shuffling(dp) - return Mapper(dp, self._prepare_image) - - meta_dp = Filter(meta_dp, self._filter_meta_files) - meta_dp = JsonParser(meta_dp) - meta_dp = Mapper(meta_dp, getitem(1)) - meta_dp: IterDataPipe[Dict[str, Dict[str, Any]]] = MappingIterator(meta_dp) - images_meta_dp, anns_meta_dp = Demultiplexer( - meta_dp, - 2, - self._classify_meta, - drop_none=True, - buffer_size=INFINITE_BUFFER_SIZE, - ) - - images_meta_dp = Mapper(images_meta_dp, getitem(1)) - images_meta_dp = UnBatcher(images_meta_dp) - - anns_meta_dp = Mapper(anns_meta_dp, getitem(1)) - anns_meta_dp = UnBatcher(anns_meta_dp) - anns_meta_dp = Grouper(anns_meta_dp, group_key_fn=getitem("image_id"), buffer_size=INFINITE_BUFFER_SIZE) - anns_meta_dp = 
hint_shuffling(anns_meta_dp) - anns_meta_dp = hint_sharding(anns_meta_dp) - - anns_dp = IterKeyZipper( - anns_meta_dp, - images_meta_dp, - key_fn=getitem(0, "image_id"), - ref_key_fn=getitem("id"), - buffer_size=INFINITE_BUFFER_SIZE, - ) - dp = IterKeyZipper( - anns_dp, - images_dp, - key_fn=getitem(1, "file_name"), - ref_key_fn=path_accessor("name"), - buffer_size=INFINITE_BUFFER_SIZE, - ) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return { - ("train", "2017"): defaultdict(lambda: 118_287, instances=117_266), - ("train", "2014"): defaultdict(lambda: 82_783, instances=82_081), - ("val", "2017"): defaultdict(lambda: 5_000, instances=4_952), - ("val", "2014"): defaultdict(lambda: 40_504, instances=40_137), - }[(self._split, self._year)][ - self._annotations # type: ignore[index] - ] - - def _generate_categories(self) -> Tuple[Tuple[str, str]]: - self._annotations = "instances" - resources = self._resources() - - dp = resources[1].load(self._root) - dp = Filter(dp, self._filter_meta_files) - dp = JsonParser(dp) - - _, meta = next(iter(dp)) - # List[Tuple[super_category, id, category]] - label_data = [cast(Tuple[str, int, str], tuple(info.values())) for info in meta["categories"]] - - # COCO actually defines 91 categories, but only 80 of them have instances. Still, the category_id refers to the - # full set. To keep the labels dense, we fill the gaps with N/A. Note that there are only 10 gaps, so the total - # number of categories is 90 rather than 91. - _, ids, _ = zip(*label_data) - missing_ids = set(range(1, max(ids) + 1)) - set(ids) - label_data.extend([("N/A", id, "N/A") for id in missing_ids]) - - # We also add a background category to be used during segmentation. - label_data.append(("N/A", 0, "__background__")) - - super_categories, _, categories = zip(*sorted(label_data, key=lambda info: info[1])) - - return cast(Tuple[Tuple[str, str]], tuple(zip(categories, super_categories))) diff --git a/torchvision/prototype/datasets/_builtin/country211.categories b/torchvision/prototype/datasets/_builtin/country211.categories deleted file mode 100644 index 6fc3e99a185..00000000000 --- a/torchvision/prototype/datasets/_builtin/country211.categories +++ /dev/null @@ -1,211 +0,0 @@ -AD -AE -AF -AG -AI -AL -AM -AO -AQ -AR -AT -AU -AW -AX -AZ -BA -BB -BD -BE -BF -BG -BH -BJ -BM -BN -BO -BQ -BR -BS -BT -BW -BY -BZ -CA -CD -CF -CH -CI -CK -CL -CM -CN -CO -CR -CU -CV -CW -CY -CZ -DE -DK -DM -DO -DZ -EC -EE -EG -ES -ET -FI -FJ -FK -FO -FR -GA -GB -GD -GE -GF -GG -GH -GI -GL -GM -GP -GR -GS -GT -GU -GY -HK -HN -HR -HT -HU -ID -IE -IL -IM -IN -IQ -IR -IS -IT -JE -JM -JO -JP -KE -KG -KH -KN -KP -KR -KW -KY -KZ -LA -LB -LC -LI -LK -LR -LT -LU -LV -LY -MA -MC -MD -ME -MF -MG -MK -ML -MM -MN -MO -MQ -MR -MT -MU -MV -MW -MX -MY -MZ -NA -NC -NG -NI -NL -NO -NP -NZ -OM -PA -PE -PF -PG -PH -PK -PL -PR -PS -PT -PW -PY -QA -RE -RO -RS -RU -RW -SA -SB -SC -SD -SE -SG -SH -SI -SJ -SK -SL -SM -SN -SO -SS -SV -SX -SY -SZ -TG -TH -TJ -TL -TM -TN -TO -TR -TT -TW -TZ -UA -UG -US -UY -UZ -VA -VE -VG -VI -VN -VU -WS -XK -YE -ZA -ZM -ZW diff --git a/torchvision/prototype/datasets/_builtin/country211.py b/torchvision/prototype/datasets/_builtin/country211.py deleted file mode 100644 index f9821ea4eb6..00000000000 --- a/torchvision/prototype/datasets/_builtin/country211.py +++ /dev/null @@ -1,81 +0,0 @@ -import pathlib -from typing import Any, Dict, List, Tuple, Union - -from torchdata.datapipes.iter import Filter, IterDataPipe, Mapper -from torchvision.prototype.datasets.utils import Dataset, 
HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - hint_sharding, - hint_shuffling, - path_comparator, - read_categories_file, -) -from torchvision.prototype.features import EncodedImage, Label - -from .._api import register_dataset, register_info - -NAME = "country211" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict(categories=read_categories_file(NAME)) - - -@register_dataset(NAME) -class Country211(Dataset): - """ - - **homepage**: https://github.com/openai/CLIP/blob/main/data/country211.md - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "val", "test")) - self._split_folder_name = "valid" if split == "val" else split - - self._categories = _info()["categories"] - - super().__init__(root, skip_integrity_check=skip_integrity_check) - - def _resources(self) -> List[OnlineResource]: - return [ - HttpResource( - "https://openaipublic.azureedge.net/clip/data/country211.tgz", - sha256="c011343cdc1296a8c31ff1d7129cf0b5e5b8605462cffd24f89266d6e6f4da3c", - ) - ] - - def _prepare_sample(self, data: Tuple[str, Any]) -> Dict[str, Any]: - path, buffer = data - category = pathlib.Path(path).parent.name - return dict( - label=Label.from_category(category, categories=self._categories), - path=path, - image=EncodedImage.from_file(buffer), - ) - - def _filter_split(self, data: Tuple[str, Any], *, split: str) -> bool: - return pathlib.Path(data[0]).parent.parent.name == split - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - dp = resource_dps[0] - dp = Filter(dp, path_comparator("parent.parent.name", self._split_folder_name)) - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return { - "train": 31_650, - "val": 10_550, - "test": 21_100, - }[self._split] - - def _generate_categories(self) -> List[str]: - resources = self._resources() - dp = resources[0].load(self._root) - return sorted({pathlib.Path(path).parent.name for path, _ in dp}) diff --git a/torchvision/prototype/datasets/_builtin/cub200.categories b/torchvision/prototype/datasets/_builtin/cub200.categories deleted file mode 100644 index f91754c930c..00000000000 --- a/torchvision/prototype/datasets/_builtin/cub200.categories +++ /dev/null @@ -1,200 +0,0 @@ -Black_footed_Albatross -Laysan_Albatross -Sooty_Albatross -Groove_billed_Ani -Crested_Auklet -Least_Auklet -Parakeet_Auklet -Rhinoceros_Auklet -Brewer_Blackbird -Red_winged_Blackbird -Rusty_Blackbird -Yellow_headed_Blackbird -Bobolink -Indigo_Bunting -Lazuli_Bunting -Painted_Bunting -Cardinal -Spotted_Catbird -Gray_Catbird -Yellow_breasted_Chat -Eastern_Towhee -Chuck_will_Widow -Brandt_Cormorant -Red_faced_Cormorant -Pelagic_Cormorant -Bronzed_Cowbird -Shiny_Cowbird -Brown_Creeper -American_Crow -Fish_Crow -Black_billed_Cuckoo -Mangrove_Cuckoo -Yellow_billed_Cuckoo -Gray_crowned_Rosy_Finch -Purple_Finch -Northern_Flicker -Acadian_Flycatcher -Great_Crested_Flycatcher -Least_Flycatcher -Olive_sided_Flycatcher -Scissor_tailed_Flycatcher -Vermilion_Flycatcher -Yellow_bellied_Flycatcher -Frigatebird -Northern_Fulmar -Gadwall -American_Goldfinch -European_Goldfinch -Boat_tailed_Grackle -Eared_Grebe -Horned_Grebe -Pied_billed_Grebe -Western_Grebe -Blue_Grosbeak -Evening_Grosbeak -Pine_Grosbeak -Rose_breasted_Grosbeak -Pigeon_Guillemot -California_Gull 
-Glaucous_winged_Gull -Heermann_Gull -Herring_Gull -Ivory_Gull -Ring_billed_Gull -Slaty_backed_Gull -Western_Gull -Anna_Hummingbird -Ruby_throated_Hummingbird -Rufous_Hummingbird -Green_Violetear -Long_tailed_Jaeger -Pomarine_Jaeger -Blue_Jay -Florida_Jay -Green_Jay -Dark_eyed_Junco -Tropical_Kingbird -Gray_Kingbird -Belted_Kingfisher -Green_Kingfisher -Pied_Kingfisher -Ringed_Kingfisher -White_breasted_Kingfisher -Red_legged_Kittiwake -Horned_Lark -Pacific_Loon -Mallard -Western_Meadowlark -Hooded_Merganser -Red_breasted_Merganser -Mockingbird -Nighthawk -Clark_Nutcracker -White_breasted_Nuthatch -Baltimore_Oriole -Hooded_Oriole -Orchard_Oriole -Scott_Oriole -Ovenbird -Brown_Pelican -White_Pelican -Western_Wood_Pewee -Sayornis -American_Pipit -Whip_poor_Will -Horned_Puffin -Common_Raven -White_necked_Raven -American_Redstart -Geococcyx -Loggerhead_Shrike -Great_Grey_Shrike -Baird_Sparrow -Black_throated_Sparrow -Brewer_Sparrow -Chipping_Sparrow -Clay_colored_Sparrow -House_Sparrow -Field_Sparrow -Fox_Sparrow -Grasshopper_Sparrow -Harris_Sparrow -Henslow_Sparrow -Le_Conte_Sparrow -Lincoln_Sparrow -Nelson_Sharp_tailed_Sparrow -Savannah_Sparrow -Seaside_Sparrow -Song_Sparrow -Tree_Sparrow -Vesper_Sparrow -White_crowned_Sparrow -White_throated_Sparrow -Cape_Glossy_Starling -Bank_Swallow -Barn_Swallow -Cliff_Swallow -Tree_Swallow -Scarlet_Tanager -Summer_Tanager -Artic_Tern -Black_Tern -Caspian_Tern -Common_Tern -Elegant_Tern -Forsters_Tern -Least_Tern -Green_tailed_Towhee -Brown_Thrasher -Sage_Thrasher -Black_capped_Vireo -Blue_headed_Vireo -Philadelphia_Vireo -Red_eyed_Vireo -Warbling_Vireo -White_eyed_Vireo -Yellow_throated_Vireo -Bay_breasted_Warbler -Black_and_white_Warbler -Black_throated_Blue_Warbler -Blue_winged_Warbler -Canada_Warbler -Cape_May_Warbler -Cerulean_Warbler -Chestnut_sided_Warbler -Golden_winged_Warbler -Hooded_Warbler -Kentucky_Warbler -Magnolia_Warbler -Mourning_Warbler -Myrtle_Warbler -Nashville_Warbler -Orange_crowned_Warbler -Palm_Warbler -Pine_Warbler -Prairie_Warbler -Prothonotary_Warbler -Swainson_Warbler -Tennessee_Warbler -Wilson_Warbler -Worm_eating_Warbler -Yellow_Warbler -Northern_Waterthrush -Louisiana_Waterthrush -Bohemian_Waxwing -Cedar_Waxwing -American_Three_toed_Woodpecker -Pileated_Woodpecker -Red_bellied_Woodpecker -Red_cockaded_Woodpecker -Red_headed_Woodpecker -Downy_Woodpecker -Bewick_Wren -Cactus_Wren -Carolina_Wren -House_Wren -Marsh_Wren -Rock_Wren -Winter_Wren -Common_Yellowthroat diff --git a/torchvision/prototype/datasets/_builtin/cub200.py b/torchvision/prototype/datasets/_builtin/cub200.py deleted file mode 100644 index c07166a960c..00000000000 --- a/torchvision/prototype/datasets/_builtin/cub200.py +++ /dev/null @@ -1,258 +0,0 @@ -import csv -import functools -import pathlib -from typing import Any, BinaryIO, Callable, Dict, List, Optional, Tuple, Union - -from torchdata.datapipes.iter import ( - CSVDictParser, - CSVParser, - Demultiplexer, - Filter, - IterDataPipe, - IterKeyZipper, - LineReader, - Mapper, -) -from torchdata.datapipes.map import IterToMapConverter -from torchvision.prototype.datasets.utils import Dataset, GDriveResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - getitem, - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - path_accessor, - path_comparator, - read_categories_file, - read_mat, -) -from torchvision.prototype.features import _Feature, BoundingBox, EncodedImage, Label - -from .._api import register_dataset, register_info - -csv.register_dialect("cub200", delimiter=" 
") - - -NAME = "cub200" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict(categories=read_categories_file(NAME)) - - -@register_dataset(NAME) -class CUB200(Dataset): - """ - - **homepage**: http://www.vision.caltech.edu/visipedia/CUB-200.html - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - year: str = "2011", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "test")) - self._year = self._verify_str_arg(year, "year", ("2010", "2011")) - - self._categories = _info()["categories"] - - super().__init__( - root, - # TODO: this will only be available after https://github.com/pytorch/vision/pull/5473 - # dependencies=("scipy",), - skip_integrity_check=skip_integrity_check, - ) - - def _resources(self) -> List[OnlineResource]: - if self._year == "2011": - archive = GDriveResource( - "1hbzc_P1FuxMkcabkgn9ZKinBwW683j45", - file_name="CUB_200_2011.tgz", - sha256="0c685df5597a8b24909f6a7c9db6d11e008733779a671760afef78feb49bf081", - preprocess="decompress", - ) - segmentations = GDriveResource( - "1EamOKGLoTuZdtcVYbHMWNpkn3iAVj8TP", - file_name="segmentations.tgz", - sha256="dc77f6cffea0cbe2e41d4201115c8f29a6320ecb04fffd2444f51b8066e4b84f", - preprocess="decompress", - ) - return [archive, segmentations] - else: # self._year == "2010" - split = GDriveResource( - "1vZuZPqha0JjmwkdaS_XtYryE3Jf5Q1AC", - file_name="lists.tgz", - sha256="aeacbd5e3539ae84ea726e8a266a9a119c18f055cd80f3836d5eb4500b005428", - preprocess="decompress", - ) - images = GDriveResource( - "1GDr1OkoXdhaXWGA8S3MAq3a522Tak-nx", - file_name="images.tgz", - sha256="2a6d2246bbb9778ca03aa94e2e683ccb4f8821a36b7f235c0822e659d60a803e", - preprocess="decompress", - ) - anns = GDriveResource( - "16NsbTpMs5L6hT4hUJAmpW2u7wH326WTR", - file_name="annotations.tgz", - sha256="c17b7841c21a66aa44ba8fe92369cc95dfc998946081828b1d7b8a4b716805c1", - preprocess="decompress", - ) - return [split, images, anns] - - def _2011_classify_archive(self, data: Tuple[str, Any]) -> Optional[int]: - path = pathlib.Path(data[0]) - if path.parents[1].name == "images": - return 0 - elif path.name == "train_test_split.txt": - return 1 - elif path.name == "images.txt": - return 2 - elif path.name == "bounding_boxes.txt": - return 3 - else: - return None - - def _2011_extract_file_name(self, rel_posix_path: str) -> str: - return rel_posix_path.rsplit("/", maxsplit=1)[1] - - def _2011_filter_split(self, row: List[str]) -> bool: - _, split_id = row - return { - "0": "test", - "1": "train", - }[split_id] == self._split - - def _2011_segmentation_key(self, data: Tuple[str, Any]) -> str: - path = pathlib.Path(data[0]) - return path.with_suffix(".jpg").name - - def _2011_prepare_ann( - self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], image_size: Tuple[int, int] - ) -> Dict[str, Any]: - _, (bounding_box_data, segmentation_data) = data - segmentation_path, segmentation_buffer = segmentation_data - return dict( - bounding_box=BoundingBox( - [float(part) for part in bounding_box_data[1:]], format="xywh", image_size=image_size - ), - segmentation_path=segmentation_path, - segmentation=EncodedImage.from_file(segmentation_buffer), - ) - - def _2010_split_key(self, data: str) -> str: - return data.rsplit("/", maxsplit=1)[1] - - def _2010_anns_key(self, data: Tuple[str, BinaryIO]) -> Tuple[str, Tuple[str, BinaryIO]]: - path = pathlib.Path(data[0]) - return path.with_suffix(".jpg").name, data - - def _2010_prepare_ann(self, data: Tuple[str, 
Tuple[str, BinaryIO]], image_size: Tuple[int, int]) -> Dict[str, Any]: - _, (path, buffer) = data - content = read_mat(buffer) - return dict( - ann_path=path, - bounding_box=BoundingBox( - [int(content["bbox"][coord]) for coord in ("left", "bottom", "right", "top")], - format="xyxy", - image_size=image_size, - ), - segmentation=_Feature(content["seg"]), - ) - - def _prepare_sample( - self, - data: Tuple[Tuple[str, Tuple[str, BinaryIO]], Any], - *, - prepare_ann_fn: Callable[[Any, Tuple[int, int]], Dict[str, Any]], - ) -> Dict[str, Any]: - data, anns_data = data - _, image_data = data - path, buffer = image_data - - image = EncodedImage.from_file(buffer) - - return dict( - prepare_ann_fn(anns_data, image.image_size), - image=image, - label=Label(int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]), categories=self._categories), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - prepare_ann_fn: Callable - if self._year == "2011": - archive_dp, segmentations_dp = resource_dps - images_dp, split_dp, image_files_dp, bounding_boxes_dp = Demultiplexer( - archive_dp, 4, self._2011_classify_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE - ) - - image_files_dp = CSVParser(image_files_dp, dialect="cub200") - image_files_dp = Mapper(image_files_dp, self._2011_extract_file_name, input_col=1) - image_files_map = IterToMapConverter(image_files_dp) - - split_dp = CSVParser(split_dp, dialect="cub200") - split_dp = Filter(split_dp, self._2011_filter_split) - split_dp = Mapper(split_dp, getitem(0)) - split_dp = Mapper(split_dp, image_files_map.__getitem__) - - bounding_boxes_dp = CSVParser(bounding_boxes_dp, dialect="cub200") - bounding_boxes_dp = Mapper(bounding_boxes_dp, image_files_map.__getitem__, input_col=0) - - anns_dp = IterKeyZipper( - bounding_boxes_dp, - segmentations_dp, - key_fn=getitem(0), - ref_key_fn=self._2011_segmentation_key, - keep_key=True, - buffer_size=INFINITE_BUFFER_SIZE, - ) - - prepare_ann_fn = self._2011_prepare_ann - else: # self._year == "2010" - split_dp, images_dp, anns_dp = resource_dps - - split_dp = Filter(split_dp, path_comparator("name", f"{self._split}.txt")) - split_dp = LineReader(split_dp, decode=True, return_path=False) - split_dp = Mapper(split_dp, self._2010_split_key) - - anns_dp = Mapper(anns_dp, self._2010_anns_key) - - prepare_ann_fn = self._2010_prepare_ann - - split_dp = hint_shuffling(split_dp) - split_dp = hint_sharding(split_dp) - - dp = IterKeyZipper( - split_dp, - images_dp, - getitem(), - path_accessor("name"), - buffer_size=INFINITE_BUFFER_SIZE, - ) - dp = IterKeyZipper( - dp, - anns_dp, - getitem(0), - buffer_size=INFINITE_BUFFER_SIZE, - ) - return Mapper(dp, functools.partial(self._prepare_sample, prepare_ann_fn=prepare_ann_fn)) - - def __len__(self) -> int: - return { - ("train", "2010"): 3_000, - ("test", "2010"): 3_033, - ("train", "2011"): 5_994, - ("test", "2011"): 5_794, - }[(self._split, self._year)] - - def _generate_categories(self) -> List[str]: - self._year = "2011" - resources = self._resources() - - dp = resources[0].load(self._root) - dp = Filter(dp, path_comparator("name", "classes.txt")) - dp = CSVDictParser(dp, fieldnames=("label", "category"), dialect="cub200") - - return [row["category"].split(".")[1] for row in dp] diff --git a/torchvision/prototype/datasets/_builtin/dtd.categories b/torchvision/prototype/datasets/_builtin/dtd.categories deleted file mode 100644 index 7f3df8a2b00..00000000000 --- a/torchvision/prototype/datasets/_builtin/dtd.categories +++ /dev/null @@ 
-1,47 +0,0 @@
-banded
-blotchy
-braided
-bubbly
-bumpy
-chequered
-cobwebbed
-cracked
-crosshatched
-crystalline
-dotted
-fibrous
-flecked
-freckled
-frilly
-gauzy
-grid
-grooved
-honeycombed
-interlaced
-knitted
-lacelike
-lined
-marbled
-matted
-meshed
-paisley
-perforated
-pitted
-pleated
-polka-dotted
-porous
-potholed
-scaly
-smeared
-spiralled
-sprinkled
-stained
-stratified
-striped
-studded
-swirly
-veined
-waffled
-woven
-wrinkled
-zigzagged
diff --git a/torchvision/prototype/datasets/_builtin/dtd.py b/torchvision/prototype/datasets/_builtin/dtd.py
deleted file mode 100644
index e7ff1e79559..00000000000
--- a/torchvision/prototype/datasets/_builtin/dtd.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import enum
-import pathlib
-from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union
-
-from torchdata.datapipes.iter import CSVParser, Demultiplexer, Filter, IterDataPipe, IterKeyZipper, LineReader, Mapper
-from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource
-from torchvision.prototype.datasets.utils._internal import (
-    getitem,
-    hint_sharding,
-    hint_shuffling,
-    INFINITE_BUFFER_SIZE,
-    path_comparator,
-    read_categories_file,
-)
-from torchvision.prototype.features import EncodedImage, Label
-
-from .._api import register_dataset, register_info
-
-
-NAME = "dtd"
-
-
-class DTDDemux(enum.IntEnum):
-    SPLIT = 0
-    JOINT_CATEGORIES = 1
-    IMAGES = 2
-
-
-@register_info(NAME)
-def _info() -> Dict[str, Any]:
-    return dict(categories=read_categories_file(NAME))
-
-
-@register_dataset(NAME)
-class DTD(Dataset):
-    """DTD Dataset.
-    homepage="https://www.robots.ox.ac.uk/~vgg/data/dtd/",
-    """
-
-    def __init__(
-        self,
-        root: Union[str, pathlib.Path],
-        *,
-        split: str = "train",
-        fold: int = 1,
-        skip_validation_check: bool = False,
-    ) -> None:
-        self._split = self._verify_str_arg(split, "split", {"train", "val", "test"})
-
-        if not (1 <= fold <= 10):
-            raise ValueError(f"The fold parameter should be an integer in [1, 10]. Got {fold}")
-        self._fold = fold
-
-        self._categories = _info()["categories"]
-
-        super().__init__(root, skip_integrity_check=skip_validation_check)
-
-    def _resources(self) -> List[OnlineResource]:
-        archive = HttpResource(
-            "https://www.robots.ox.ac.uk/~vgg/data/dtd/download/dtd-r1.0.1.tar.gz",
-            sha256="e42855a52a4950a3b59612834602aa253914755c95b0cff9ead6d07395f8e205",
-            preprocess="decompress",
-        )
-        return [archive]
-
-    def _classify_archive(self, data: Tuple[str, Any]) -> Optional[int]:
-        path = pathlib.Path(data[0])
-        if path.parent.name == "labels":
-            if path.name == "labels_joint_anno.txt":
-                return DTDDemux.JOINT_CATEGORIES
-
-            return DTDDemux.SPLIT
-        elif path.parents[1].name == "images":
-            return DTDDemux.IMAGES
-        else:
-            return None
-
-    def _image_key_fn(self, data: Tuple[str, Any]) -> str:
-        path = pathlib.Path(data[0])
-        # The split files contain hardcoded posix paths for the images, e.g. 
banded/banded_0001.jpg - return str(path.relative_to(path.parents[1]).as_posix()) - - def _prepare_sample(self, data: Tuple[Tuple[str, List[str]], Tuple[str, BinaryIO]]) -> Dict[str, Any]: - (_, joint_categories_data), image_data = data - _, *joint_categories = joint_categories_data - path, buffer = image_data - - category = pathlib.Path(path).parent.name - - return dict( - joint_categories={category for category in joint_categories if category}, - label=Label.from_category(category, categories=self._categories), - path=path, - image=EncodedImage.from_file(buffer), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - archive_dp = resource_dps[0] - - splits_dp, joint_categories_dp, images_dp = Demultiplexer( - archive_dp, 3, self._classify_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE - ) - - splits_dp = Filter(splits_dp, path_comparator("name", f"{self._split}{self._fold}.txt")) - splits_dp = LineReader(splits_dp, decode=True, return_path=False) - splits_dp = hint_shuffling(splits_dp) - splits_dp = hint_sharding(splits_dp) - - joint_categories_dp = CSVParser(joint_categories_dp, delimiter=" ") - - dp = IterKeyZipper( - splits_dp, - joint_categories_dp, - key_fn=getitem(), - ref_key_fn=getitem(0), - buffer_size=INFINITE_BUFFER_SIZE, - ) - dp = IterKeyZipper( - dp, - images_dp, - key_fn=getitem(0), - ref_key_fn=self._image_key_fn, - buffer_size=INFINITE_BUFFER_SIZE, - ) - return Mapper(dp, self._prepare_sample) - - def _filter_images(self, data: Tuple[str, Any]) -> bool: - return self._classify_archive(data) == DTDDemux.IMAGES - - def _generate_categories(self) -> List[str]: - resources = self._resources() - - dp = resources[0].load(self._root) - dp = Filter(dp, self._filter_images) - - return sorted({pathlib.Path(path).parent.name for path, _ in dp}) - - def __len__(self) -> int: - return 1_880 # All splits have the same length diff --git a/torchvision/prototype/datasets/_builtin/eurosat.py b/torchvision/prototype/datasets/_builtin/eurosat.py deleted file mode 100644 index 88863dbcb3a..00000000000 --- a/torchvision/prototype/datasets/_builtin/eurosat.py +++ /dev/null @@ -1,66 +0,0 @@ -import pathlib -from typing import Any, Dict, List, Tuple, Union - -from torchdata.datapipes.iter import IterDataPipe, Mapper -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling -from torchvision.prototype.features import EncodedImage, Label - -from .._api import register_dataset, register_info - -NAME = "eurosat" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict( - categories=( - "AnnualCrop", - "Forest", - "HerbaceousVegetation", - "Highway", - "Industrial", - "Pasture", - "PermanentCrop", - "Residential", - "River", - "SeaLake", - ) - ) - - -@register_dataset(NAME) -class EuroSAT(Dataset): - """EuroSAT Dataset. 
- homepage="https://github.com/phelber/eurosat", - """ - - def __init__(self, root: Union[str, pathlib.Path], *, skip_integrity_check: bool = False) -> None: - self._categories = _info()["categories"] - super().__init__(root, skip_integrity_check=skip_integrity_check) - - def _resources(self) -> List[OnlineResource]: - return [ - HttpResource( - "https://madm.dfki.de/files/sentinel/EuroSAT.zip", - sha256="8ebea626349354c5328b142b96d0430e647051f26efc2dc974c843f25ecf70bd", - ) - ] - - def _prepare_sample(self, data: Tuple[str, Any]) -> Dict[str, Any]: - path, buffer = data - category = pathlib.Path(path).parent.name - return dict( - label=Label.from_category(category, categories=self._categories), - path=path, - image=EncodedImage.from_file(buffer), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - dp = resource_dps[0] - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return 27_000 diff --git a/torchvision/prototype/datasets/_builtin/fer2013.py b/torchvision/prototype/datasets/_builtin/fer2013.py deleted file mode 100644 index b2693aa96c0..00000000000 --- a/torchvision/prototype/datasets/_builtin/fer2013.py +++ /dev/null @@ -1,63 +0,0 @@ -import pathlib -from typing import Any, Dict, List, Union - -import torch -from torchdata.datapipes.iter import CSVDictParser, IterDataPipe, Mapper -from torchvision.prototype.datasets.utils import Dataset, KaggleDownloadResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling -from torchvision.prototype.features import Image, Label - -from .._api import register_dataset, register_info - -NAME = "fer2013" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict(categories=("angry", "disgust", "fear", "happy", "sad", "surprise", "neutral")) - - -@register_dataset(NAME) -class FER2013(Dataset): - """FER 2013 Dataset - homepage="https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge" - """ - - def __init__( - self, root: Union[str, pathlib.Path], *, split: str = "train", skip_integrity_check: bool = False - ) -> None: - self._split = self._verify_str_arg(split, "split", {"train", "test"}) - self._categories = _info()["categories"] - - super().__init__(root, skip_integrity_check=skip_integrity_check) - - _CHECKSUMS = { - "train": "a2b7c9360cc0b38d21187e5eece01c2799fce5426cdeecf746889cc96cda2d10", - "test": "dec8dfe8021e30cd6704b85ec813042b4a5d99d81cb55e023291a94104f575c3", - } - - def _resources(self) -> List[OnlineResource]: - archive = KaggleDownloadResource( - "https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge", - file_name=f"{self._split}.csv.zip", - sha256=self._CHECKSUMS[self._split], - ) - return [archive] - - def _prepare_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: - label_id = data.get("emotion") - - return dict( - image=Image(torch.tensor([int(idx) for idx in data["pixels"].split()], dtype=torch.uint8).reshape(48, 48)), - label=Label(int(label_id), categories=self._categories) if label_id is not None else None, - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - dp = resource_dps[0] - dp = CSVDictParser(dp) - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return 28_709 if self._split == "train" else 3_589 diff --git 
a/torchvision/prototype/datasets/_builtin/food101.categories b/torchvision/prototype/datasets/_builtin/food101.categories deleted file mode 100644 index 59f252ddff4..00000000000 --- a/torchvision/prototype/datasets/_builtin/food101.categories +++ /dev/null @@ -1,101 +0,0 @@ -apple_pie -baby_back_ribs -baklava -beef_carpaccio -beef_tartare -beet_salad -beignets -bibimbap -bread_pudding -breakfast_burrito -bruschetta -caesar_salad -cannoli -caprese_salad -carrot_cake -ceviche -cheesecake -cheese_plate -chicken_curry -chicken_quesadilla -chicken_wings -chocolate_cake -chocolate_mousse -churros -clam_chowder -club_sandwich -crab_cakes -creme_brulee -croque_madame -cup_cakes -deviled_eggs -donuts -dumplings -edamame -eggs_benedict -escargots -falafel -filet_mignon -fish_and_chips -foie_gras -french_fries -french_onion_soup -french_toast -fried_calamari -fried_rice -frozen_yogurt -garlic_bread -gnocchi -greek_salad -grilled_cheese_sandwich -grilled_salmon -guacamole -gyoza -hamburger -hot_and_sour_soup -hot_dog -huevos_rancheros -hummus -ice_cream -lasagna -lobster_bisque -lobster_roll_sandwich -macaroni_and_cheese -macarons -miso_soup -mussels -nachos -omelette -onion_rings -oysters -pad_thai -paella -pancakes -panna_cotta -peking_duck -pho -pizza -pork_chop -poutine -prime_rib -pulled_pork_sandwich -ramen -ravioli -red_velvet_cake -risotto -samosa -sashimi -scallops -seaweed_salad -shrimp_and_grits -spaghetti_bolognese -spaghetti_carbonara -spring_rolls -steak -strawberry_shortcake -sushi -tacos -takoyaki -tiramisu -tuna_tartare -waffles diff --git a/torchvision/prototype/datasets/_builtin/food101.py b/torchvision/prototype/datasets/_builtin/food101.py deleted file mode 100644 index 3657116ae7a..00000000000 --- a/torchvision/prototype/datasets/_builtin/food101.py +++ /dev/null @@ -1,97 +0,0 @@ -from pathlib import Path -from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union - -from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, LineReader, Mapper -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - getitem, - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - path_comparator, - read_categories_file, -) -from torchvision.prototype.features import EncodedImage, Label - -from .._api import register_dataset, register_info - - -NAME = "food101" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict(categories=read_categories_file(NAME)) - - -@register_dataset(NAME) -class Food101(Dataset): - """Food 101 dataset - homepage="https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101", - """ - - def __init__(self, root: Union[str, Path], *, split: str = "train", skip_integrity_check: bool = False) -> None: - self._split = self._verify_str_arg(split, "split", {"train", "test"}) - self._categories = _info()["categories"] - - super().__init__(root, skip_integrity_check=skip_integrity_check) - - def _resources(self) -> List[OnlineResource]: - return [ - HttpResource( - url="http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz", - sha256="d97d15e438b7f4498f96086a4f7e2fa42a32f2712e87d3295441b2b6314053a4", - preprocess="decompress", - ) - ] - - def _classify_archive(self, data: Tuple[str, Any]) -> Optional[int]: - path = Path(data[0]) - if path.parents[1].name == "images": - return 0 - elif path.parents[0].name == "meta": - return 1 - else: - return None - - def _prepare_sample(self, data: Tuple[str, Tuple[str, BinaryIO]]) -> Dict[str, 
Any]: - id, (path, buffer) = data - return dict( - label=Label.from_category(id.split("/", 1)[0], categories=self._categories), - path=path, - image=EncodedImage.from_file(buffer), - ) - - def _image_key(self, data: Tuple[str, Any]) -> str: - path = Path(data[0]) - return path.relative_to(path.parents[1]).with_suffix("").as_posix() - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - archive_dp = resource_dps[0] - images_dp, split_dp = Demultiplexer( - archive_dp, 2, self._classify_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE - ) - split_dp = Filter(split_dp, path_comparator("name", f"{self._split}.txt")) - split_dp = LineReader(split_dp, decode=True, return_path=False) - split_dp = hint_sharding(split_dp) - split_dp = hint_shuffling(split_dp) - - dp = IterKeyZipper( - split_dp, - images_dp, - key_fn=getitem(), - ref_key_fn=self._image_key, - buffer_size=INFINITE_BUFFER_SIZE, - ) - - return Mapper(dp, self._prepare_sample) - - def _generate_categories(self) -> List[str]: - resources = self._resources() - dp = resources[0].load(self._root) - dp = Filter(dp, path_comparator("name", "classes.txt")) - dp = LineReader(dp, decode=True, return_path=False) - return list(dp) - - def __len__(self) -> int: - return 75_750 if self._split == "train" else 25_250 diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py deleted file mode 100644 index 8dc0a8240c8..00000000000 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ /dev/null @@ -1,111 +0,0 @@ -import pathlib -from typing import Any, Dict, List, Optional, Tuple, Union - -from torchdata.datapipes.iter import CSVDictParser, Demultiplexer, Filter, IterDataPipe, Mapper, Zipper -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - path_comparator, -) -from torchvision.prototype.features import BoundingBox, EncodedImage, Label - -from .._api import register_dataset, register_info - -NAME = "gtsrb" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict( - categories=[f"{label:05d}" for label in range(43)], - ) - - -@register_dataset(NAME) -class GTSRB(Dataset): - """GTSRB Dataset - - homepage="https://benchmark.ini.rub.de" - """ - - def __init__( - self, root: Union[str, pathlib.Path], *, split: str = "train", skip_integrity_check: bool = False - ) -> None: - self._split = self._verify_str_arg(split, "split", {"train", "test"}) - self._categories = _info()["categories"] - super().__init__(root, skip_integrity_check=skip_integrity_check) - - _URL_ROOT = "https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/" - _URLS = { - "train": f"{_URL_ROOT}GTSRB-Training_fixed.zip", - "test": f"{_URL_ROOT}GTSRB_Final_Test_Images.zip", - "test_ground_truth": f"{_URL_ROOT}GTSRB_Final_Test_GT.zip", - } - _CHECKSUMS = { - "train": "df4144942083645bd60b594de348aa6930126c3e0e5de09e39611630abf8455a", - "test": "48ba6fab7e877eb64eaf8de99035b0aaecfbc279bee23e35deca4ac1d0a837fa", - "test_ground_truth": "f94e5a7614d75845c74c04ddb26b8796b9e483f43541dd95dd5b726504e16d6d", - } - - def _resources(self) -> List[OnlineResource]: - rsrcs: List[OnlineResource] = [HttpResource(self._URLS[self._split], sha256=self._CHECKSUMS[self._split])] - - if self._split == "test": - rsrcs.append( - HttpResource( - self._URLS["test_ground_truth"], - sha256=self._CHECKSUMS["test_ground_truth"], - ) - 
) - - return rsrcs - - def _classify_train_archive(self, data: Tuple[str, Any]) -> Optional[int]: - path = pathlib.Path(data[0]) - if path.suffix == ".ppm": - return 0 - elif path.suffix == ".csv": - return 1 - else: - return None - - def _prepare_sample(self, data: Tuple[Tuple[str, Any], Dict[str, Any]]) -> Dict[str, Any]: - (path, buffer), csv_info = data - label = int(csv_info["ClassId"]) - - bounding_box = BoundingBox( - [int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")], - format="xyxy", - image_size=(int(csv_info["Height"]), int(csv_info["Width"])), - ) - - return { - "path": path, - "image": EncodedImage.from_file(buffer), - "label": Label(label, categories=self._categories), - "bounding_box": bounding_box, - } - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - if self._split == "train": - images_dp, ann_dp = Demultiplexer( - resource_dps[0], 2, self._classify_train_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE - ) - else: - images_dp, ann_dp = resource_dps - images_dp = Filter(images_dp, path_comparator("suffix", ".ppm")) - - # The order of the image files in the .zip archives perfectly match the order of the entries in the - # (possibly concatenated) .csv files. So we're able to use Zipper here instead of a IterKeyZipper. - ann_dp = CSVDictParser(ann_dp, delimiter=";") - dp = Zipper(images_dp, ann_dp) - - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return 26_640 if self._split == "train" else 12_630 diff --git a/torchvision/prototype/datasets/_builtin/imagenet.categories b/torchvision/prototype/datasets/_builtin/imagenet.categories deleted file mode 100644 index 7b6006ff57f..00000000000 --- a/torchvision/prototype/datasets/_builtin/imagenet.categories +++ /dev/null @@ -1,1000 +0,0 @@ -tench,n01440764 -goldfish,n01443537 -great white shark,n01484850 -tiger shark,n01491361 -hammerhead,n01494475 -electric ray,n01496331 -stingray,n01498041 -cock,n01514668 -hen,n01514859 -ostrich,n01518878 -brambling,n01530575 -goldfinch,n01531178 -house finch,n01532829 -junco,n01534433 -indigo bunting,n01537544 -robin,n01558993 -bulbul,n01560419 -jay,n01580077 -magpie,n01582220 -chickadee,n01592084 -water ouzel,n01601694 -kite,n01608432 -bald eagle,n01614925 -vulture,n01616318 -great grey owl,n01622779 -European fire salamander,n01629819 -common newt,n01630670 -eft,n01631663 -spotted salamander,n01632458 -axolotl,n01632777 -bullfrog,n01641577 -tree frog,n01644373 -tailed frog,n01644900 -loggerhead,n01664065 -leatherback turtle,n01665541 -mud turtle,n01667114 -terrapin,n01667778 -box turtle,n01669191 -banded gecko,n01675722 -common iguana,n01677366 -American chameleon,n01682714 -whiptail,n01685808 -agama,n01687978 -frilled lizard,n01688243 -alligator lizard,n01689811 -Gila monster,n01692333 -green lizard,n01693334 -African chameleon,n01694178 -Komodo dragon,n01695060 -African crocodile,n01697457 -American alligator,n01698640 -triceratops,n01704323 -thunder snake,n01728572 -ringneck snake,n01728920 -hognose snake,n01729322 -green snake,n01729977 -king snake,n01734418 -garter snake,n01735189 -water snake,n01737021 -vine snake,n01739381 -night snake,n01740131 -boa constrictor,n01742172 -rock python,n01744401 -Indian cobra,n01748264 -green mamba,n01749939 -sea snake,n01751748 -horned viper,n01753488 -diamondback,n01755581 -sidewinder,n01756291 -trilobite,n01768244 -harvestman,n01770081 -scorpion,n01770393 -black and gold garden spider,n01773157 -barn 
spider,n01773549 -garden spider,n01773797 -black widow,n01774384 -tarantula,n01774750 -wolf spider,n01775062 -tick,n01776313 -centipede,n01784675 -black grouse,n01795545 -ptarmigan,n01796340 -ruffed grouse,n01797886 -prairie chicken,n01798484 -peacock,n01806143 -quail,n01806567 -partridge,n01807496 -African grey,n01817953 -macaw,n01818515 -sulphur-crested cockatoo,n01819313 -lorikeet,n01820546 -coucal,n01824575 -bee eater,n01828970 -hornbill,n01829413 -hummingbird,n01833805 -jacamar,n01843065 -toucan,n01843383 -drake,n01847000 -red-breasted merganser,n01855032 -goose,n01855672 -black swan,n01860187 -tusker,n01871265 -echidna,n01872401 -platypus,n01873310 -wallaby,n01877812 -koala,n01882714 -wombat,n01883070 -jellyfish,n01910747 -sea anemone,n01914609 -brain coral,n01917289 -flatworm,n01924916 -nematode,n01930112 -conch,n01943899 -snail,n01944390 -slug,n01945685 -sea slug,n01950731 -chiton,n01955084 -chambered nautilus,n01968897 -Dungeness crab,n01978287 -rock crab,n01978455 -fiddler crab,n01980166 -king crab,n01981276 -American lobster,n01983481 -spiny lobster,n01984695 -crayfish,n01985128 -hermit crab,n01986214 -isopod,n01990800 -white stork,n02002556 -black stork,n02002724 -spoonbill,n02006656 -flamingo,n02007558 -little blue heron,n02009229 -American egret,n02009912 -bittern,n02011460 -crane,n02012849 -limpkin,n02013706 -European gallinule,n02017213 -American coot,n02018207 -bustard,n02018795 -ruddy turnstone,n02025239 -red-backed sandpiper,n02027492 -redshank,n02028035 -dowitcher,n02033041 -oystercatcher,n02037110 -pelican,n02051845 -king penguin,n02056570 -albatross,n02058221 -grey whale,n02066245 -killer whale,n02071294 -dugong,n02074367 -sea lion,n02077923 -Chihuahua,n02085620 -Japanese spaniel,n02085782 -Maltese dog,n02085936 -Pekinese,n02086079 -Shih-Tzu,n02086240 -Blenheim spaniel,n02086646 -papillon,n02086910 -toy terrier,n02087046 -Rhodesian ridgeback,n02087394 -Afghan hound,n02088094 -basset,n02088238 -beagle,n02088364 -bloodhound,n02088466 -bluetick,n02088632 -black-and-tan coonhound,n02089078 -Walker hound,n02089867 -English foxhound,n02089973 -redbone,n02090379 -borzoi,n02090622 -Irish wolfhound,n02090721 -Italian greyhound,n02091032 -whippet,n02091134 -Ibizan hound,n02091244 -Norwegian elkhound,n02091467 -otterhound,n02091635 -Saluki,n02091831 -Scottish deerhound,n02092002 -Weimaraner,n02092339 -Staffordshire bullterrier,n02093256 -American Staffordshire terrier,n02093428 -Bedlington terrier,n02093647 -Border terrier,n02093754 -Kerry blue terrier,n02093859 -Irish terrier,n02093991 -Norfolk terrier,n02094114 -Norwich terrier,n02094258 -Yorkshire terrier,n02094433 -wire-haired fox terrier,n02095314 -Lakeland terrier,n02095570 -Sealyham terrier,n02095889 -Airedale,n02096051 -cairn,n02096177 -Australian terrier,n02096294 -Dandie Dinmont,n02096437 -Boston bull,n02096585 -miniature schnauzer,n02097047 -giant schnauzer,n02097130 -standard schnauzer,n02097209 -Scotch terrier,n02097298 -Tibetan terrier,n02097474 -silky terrier,n02097658 -soft-coated wheaten terrier,n02098105 -West Highland white terrier,n02098286 -Lhasa,n02098413 -flat-coated retriever,n02099267 -curly-coated retriever,n02099429 -golden retriever,n02099601 -Labrador retriever,n02099712 -Chesapeake Bay retriever,n02099849 -German short-haired pointer,n02100236 -vizsla,n02100583 -English setter,n02100735 -Irish setter,n02100877 -Gordon setter,n02101006 -Brittany spaniel,n02101388 -clumber,n02101556 -English springer,n02102040 -Welsh springer spaniel,n02102177 -cocker spaniel,n02102318 -Sussex spaniel,n02102480 -Irish 
water spaniel,n02102973 -kuvasz,n02104029 -schipperke,n02104365 -groenendael,n02105056 -malinois,n02105162 -briard,n02105251 -kelpie,n02105412 -komondor,n02105505 -Old English sheepdog,n02105641 -Shetland sheepdog,n02105855 -collie,n02106030 -Border collie,n02106166 -Bouvier des Flandres,n02106382 -Rottweiler,n02106550 -German shepherd,n02106662 -Doberman,n02107142 -miniature pinscher,n02107312 -Greater Swiss Mountain dog,n02107574 -Bernese mountain dog,n02107683 -Appenzeller,n02107908 -EntleBucher,n02108000 -boxer,n02108089 -bull mastiff,n02108422 -Tibetan mastiff,n02108551 -French bulldog,n02108915 -Great Dane,n02109047 -Saint Bernard,n02109525 -Eskimo dog,n02109961 -malamute,n02110063 -Siberian husky,n02110185 -dalmatian,n02110341 -affenpinscher,n02110627 -basenji,n02110806 -pug,n02110958 -Leonberg,n02111129 -Newfoundland,n02111277 -Great Pyrenees,n02111500 -Samoyed,n02111889 -Pomeranian,n02112018 -chow,n02112137 -keeshond,n02112350 -Brabancon griffon,n02112706 -Pembroke,n02113023 -Cardigan,n02113186 -toy poodle,n02113624 -miniature poodle,n02113712 -standard poodle,n02113799 -Mexican hairless,n02113978 -timber wolf,n02114367 -white wolf,n02114548 -red wolf,n02114712 -coyote,n02114855 -dingo,n02115641 -dhole,n02115913 -African hunting dog,n02116738 -hyena,n02117135 -red fox,n02119022 -kit fox,n02119789 -Arctic fox,n02120079 -grey fox,n02120505 -tabby,n02123045 -tiger cat,n02123159 -Persian cat,n02123394 -Siamese cat,n02123597 -Egyptian cat,n02124075 -cougar,n02125311 -lynx,n02127052 -leopard,n02128385 -snow leopard,n02128757 -jaguar,n02128925 -lion,n02129165 -tiger,n02129604 -cheetah,n02130308 -brown bear,n02132136 -American black bear,n02133161 -ice bear,n02134084 -sloth bear,n02134418 -mongoose,n02137549 -meerkat,n02138441 -tiger beetle,n02165105 -ladybug,n02165456 -ground beetle,n02167151 -long-horned beetle,n02168699 -leaf beetle,n02169497 -dung beetle,n02172182 -rhinoceros beetle,n02174001 -weevil,n02177972 -fly,n02190166 -bee,n02206856 -ant,n02219486 -grasshopper,n02226429 -cricket,n02229544 -walking stick,n02231487 -cockroach,n02233338 -mantis,n02236044 -cicada,n02256656 -leafhopper,n02259212 -lacewing,n02264363 -dragonfly,n02268443 -damselfly,n02268853 -admiral,n02276258 -ringlet,n02277742 -monarch,n02279972 -cabbage butterfly,n02280649 -sulphur butterfly,n02281406 -lycaenid,n02281787 -starfish,n02317335 -sea urchin,n02319095 -sea cucumber,n02321529 -wood rabbit,n02325366 -hare,n02326432 -Angora,n02328150 -hamster,n02342885 -porcupine,n02346627 -fox squirrel,n02356798 -marmot,n02361337 -beaver,n02363005 -guinea pig,n02364673 -sorrel,n02389026 -zebra,n02391049 -hog,n02395406 -wild boar,n02396427 -warthog,n02397096 -hippopotamus,n02398521 -ox,n02403003 -water buffalo,n02408429 -bison,n02410509 -ram,n02412080 -bighorn,n02415577 -ibex,n02417914 -hartebeest,n02422106 -impala,n02422699 -gazelle,n02423022 -Arabian camel,n02437312 -llama,n02437616 -weasel,n02441942 -mink,n02442845 -polecat,n02443114 -black-footed ferret,n02443484 -otter,n02444819 -skunk,n02445715 -badger,n02447366 -armadillo,n02454379 -three-toed sloth,n02457408 -orangutan,n02480495 -gorilla,n02480855 -chimpanzee,n02481823 -gibbon,n02483362 -siamang,n02483708 -guenon,n02484975 -patas,n02486261 -baboon,n02486410 -macaque,n02487347 -langur,n02488291 -colobus,n02488702 -proboscis monkey,n02489166 -marmoset,n02490219 -capuchin,n02492035 -howler monkey,n02492660 -titi,n02493509 -spider monkey,n02493793 -squirrel monkey,n02494079 -Madagascar cat,n02497673 -indri,n02500267 -Indian elephant,n02504013 -African elephant,n02504458 
-lesser panda,n02509815 -giant panda,n02510455 -barracouta,n02514041 -eel,n02526121 -coho,n02536864 -rock beauty,n02606052 -anemone fish,n02607072 -sturgeon,n02640242 -gar,n02641379 -lionfish,n02643566 -puffer,n02655020 -abacus,n02666196 -abaya,n02667093 -academic gown,n02669723 -accordion,n02672831 -acoustic guitar,n02676566 -aircraft carrier,n02687172 -airliner,n02690373 -airship,n02692877 -altar,n02699494 -ambulance,n02701002 -amphibian,n02704792 -analog clock,n02708093 -apiary,n02727426 -apron,n02730930 -ashcan,n02747177 -assault rifle,n02749479 -backpack,n02769748 -bakery,n02776631 -balance beam,n02777292 -balloon,n02782093 -ballpoint,n02783161 -Band Aid,n02786058 -banjo,n02787622 -bannister,n02788148 -barbell,n02790996 -barber chair,n02791124 -barbershop,n02791270 -barn,n02793495 -barometer,n02794156 -barrel,n02795169 -barrow,n02797295 -baseball,n02799071 -basketball,n02802426 -bassinet,n02804414 -bassoon,n02804610 -bathing cap,n02807133 -bath towel,n02808304 -bathtub,n02808440 -beach wagon,n02814533 -beacon,n02814860 -beaker,n02815834 -bearskin,n02817516 -beer bottle,n02823428 -beer glass,n02823750 -bell cote,n02825657 -bib,n02834397 -bicycle-built-for-two,n02835271 -bikini,n02837789 -binder,n02840245 -binoculars,n02841315 -birdhouse,n02843684 -boathouse,n02859443 -bobsled,n02860847 -bolo tie,n02865351 -bonnet,n02869837 -bookcase,n02870880 -bookshop,n02871525 -bottlecap,n02877765 -bow,n02879718 -bow tie,n02883205 -brass,n02892201 -brassiere,n02892767 -breakwater,n02894605 -breastplate,n02895154 -broom,n02906734 -bucket,n02909870 -buckle,n02910353 -bulletproof vest,n02916936 -bullet train,n02917067 -butcher shop,n02927161 -cab,n02930766 -caldron,n02939185 -candle,n02948072 -cannon,n02950826 -canoe,n02951358 -can opener,n02951585 -cardigan,n02963159 -car mirror,n02965783 -carousel,n02966193 -carpenter's kit,n02966687 -carton,n02971356 -car wheel,n02974003 -cash machine,n02977058 -cassette,n02978881 -cassette player,n02979186 -castle,n02980441 -catamaran,n02981792 -CD player,n02988304 -cello,n02992211 -cellular telephone,n02992529 -chain,n02999410 -chainlink fence,n03000134 -chain mail,n03000247 -chain saw,n03000684 -chest,n03014705 -chiffonier,n03016953 -chime,n03017168 -china cabinet,n03018349 -Christmas stocking,n03026506 -church,n03028079 -cinema,n03032252 -cleaver,n03041632 -cliff dwelling,n03042490 -cloak,n03045698 -clog,n03047690 -cocktail shaker,n03062245 -coffee mug,n03063599 -coffeepot,n03063689 -coil,n03065424 -combination lock,n03075370 -computer keyboard,n03085013 -confectionery,n03089624 -container ship,n03095699 -convertible,n03100240 -corkscrew,n03109150 -cornet,n03110669 -cowboy boot,n03124043 -cowboy hat,n03124170 -cradle,n03125729 -construction crane,n03126707 -crash helmet,n03127747 -crate,n03127925 -crib,n03131574 -Crock Pot,n03133878 -croquet ball,n03134739 -crutch,n03141823 -cuirass,n03146219 -dam,n03160309 -desk,n03179701 -desktop computer,n03180011 -dial telephone,n03187595 -diaper,n03188531 -digital clock,n03196217 -digital watch,n03197337 -dining table,n03201208 -dishrag,n03207743 -dishwasher,n03207941 -disk brake,n03208938 -dock,n03216828 -dogsled,n03218198 -dome,n03220513 -doormat,n03223299 -drilling platform,n03240683 -drum,n03249569 -drumstick,n03250847 -dumbbell,n03255030 -Dutch oven,n03259280 -electric fan,n03271574 -electric guitar,n03272010 -electric locomotive,n03272562 -entertainment center,n03290653 -envelope,n03291819 -espresso maker,n03297495 -face powder,n03314780 -feather boa,n03325584 -file,n03337140 -fireboat,n03344393 -fire engine,n03345487 
-fire screen,n03347037 -flagpole,n03355925 -flute,n03372029 -folding chair,n03376595 -football helmet,n03379051 -forklift,n03384352 -fountain,n03388043 -fountain pen,n03388183 -four-poster,n03388549 -freight car,n03393912 -French horn,n03394916 -frying pan,n03400231 -fur coat,n03404251 -garbage truck,n03417042 -gasmask,n03424325 -gas pump,n03425413 -goblet,n03443371 -go-kart,n03444034 -golf ball,n03445777 -golfcart,n03445924 -gondola,n03447447 -gong,n03447721 -gown,n03450230 -grand piano,n03452741 -greenhouse,n03457902 -grille,n03459775 -grocery store,n03461385 -guillotine,n03467068 -hair slide,n03476684 -hair spray,n03476991 -half track,n03478589 -hammer,n03481172 -hamper,n03482405 -hand blower,n03483316 -hand-held computer,n03485407 -handkerchief,n03485794 -hard disc,n03492542 -harmonica,n03494278 -harp,n03495258 -harvester,n03496892 -hatchet,n03498962 -holster,n03527444 -home theater,n03529860 -honeycomb,n03530642 -hook,n03532672 -hoopskirt,n03534580 -horizontal bar,n03535780 -horse cart,n03538406 -hourglass,n03544143 -iPod,n03584254 -iron,n03584829 -jack-o'-lantern,n03590841 -jean,n03594734 -jeep,n03594945 -jersey,n03595614 -jigsaw puzzle,n03598930 -jinrikisha,n03599486 -joystick,n03602883 -kimono,n03617480 -knee pad,n03623198 -knot,n03627232 -lab coat,n03630383 -ladle,n03633091 -lampshade,n03637318 -laptop,n03642806 -lawn mower,n03649909 -lens cap,n03657121 -letter opener,n03658185 -library,n03661043 -lifeboat,n03662601 -lighter,n03666591 -limousine,n03670208 -liner,n03673027 -lipstick,n03676483 -Loafer,n03680355 -lotion,n03690938 -loudspeaker,n03691459 -loupe,n03692522 -lumbermill,n03697007 -magnetic compass,n03706229 -mailbag,n03709823 -mailbox,n03710193 -maillot,n03710637 -tank suit,n03710721 -manhole cover,n03717622 -maraca,n03720891 -marimba,n03721384 -mask,n03724870 -matchstick,n03729826 -maypole,n03733131 -maze,n03733281 -measuring cup,n03733805 -medicine chest,n03742115 -megalith,n03743016 -microphone,n03759954 -microwave,n03761084 -military uniform,n03763968 -milk can,n03764736 -minibus,n03769881 -miniskirt,n03770439 -minivan,n03770679 -missile,n03773504 -mitten,n03775071 -mixing bowl,n03775546 -mobile home,n03776460 -Model T,n03777568 -modem,n03777754 -monastery,n03781244 -monitor,n03782006 -moped,n03785016 -mortar,n03786901 -mortarboard,n03787032 -mosque,n03788195 -mosquito net,n03788365 -motor scooter,n03791053 -mountain bike,n03792782 -mountain tent,n03792972 -mouse,n03793489 -mousetrap,n03794056 -moving van,n03796401 -muzzle,n03803284 -nail,n03804744 -neck brace,n03814639 -necklace,n03814906 -nipple,n03825788 -notebook,n03832673 -obelisk,n03837869 -oboe,n03838899 -ocarina,n03840681 -odometer,n03841143 -oil filter,n03843555 -organ,n03854065 -oscilloscope,n03857828 -overskirt,n03866082 -oxcart,n03868242 -oxygen mask,n03868863 -packet,n03871628 -paddle,n03873416 -paddlewheel,n03874293 -padlock,n03874599 -paintbrush,n03876231 -pajama,n03877472 -palace,n03877845 -panpipe,n03884397 -paper towel,n03887697 -parachute,n03888257 -parallel bars,n03888605 -park bench,n03891251 -parking meter,n03891332 -passenger car,n03895866 -patio,n03899768 -pay-phone,n03902125 -pedestal,n03903868 -pencil box,n03908618 -pencil sharpener,n03908714 -perfume,n03916031 -Petri dish,n03920288 -photocopier,n03924679 -pick,n03929660 -pickelhaube,n03929855 -picket fence,n03930313 -pickup,n03930630 -pier,n03933933 -piggy bank,n03935335 -pill bottle,n03937543 -pillow,n03938244 -ping-pong ball,n03942813 -pinwheel,n03944341 -pirate,n03947888 -pitcher,n03950228 -plane,n03954731 -planetarium,n03956157 -plastic 
bag,n03958227 -plate rack,n03961711 -plow,n03967562 -plunger,n03970156 -Polaroid camera,n03976467 -pole,n03976657 -police van,n03977966 -poncho,n03980874 -pool table,n03982430 -pop bottle,n03983396 -pot,n03991062 -potter's wheel,n03992509 -power drill,n03995372 -prayer rug,n03998194 -printer,n04004767 -prison,n04005630 -projectile,n04008634 -projector,n04009552 -puck,n04019541 -punching bag,n04023962 -purse,n04026417 -quill,n04033901 -quilt,n04033995 -racer,n04037443 -racket,n04039381 -radiator,n04040759 -radio,n04041544 -radio telescope,n04044716 -rain barrel,n04049303 -recreational vehicle,n04065272 -reel,n04067472 -reflex camera,n04069434 -refrigerator,n04070727 -remote control,n04074963 -restaurant,n04081281 -revolver,n04086273 -rifle,n04090263 -rocking chair,n04099969 -rotisserie,n04111531 -rubber eraser,n04116512 -rugby ball,n04118538 -rule,n04118776 -running shoe,n04120489 -safe,n04125021 -safety pin,n04127249 -saltshaker,n04131690 -sandal,n04133789 -sarong,n04136333 -sax,n04141076 -scabbard,n04141327 -scale,n04141975 -school bus,n04146614 -schooner,n04147183 -scoreboard,n04149813 -screen,n04152593 -screw,n04153751 -screwdriver,n04154565 -seat belt,n04162706 -sewing machine,n04179913 -shield,n04192698 -shoe shop,n04200800 -shoji,n04201297 -shopping basket,n04204238 -shopping cart,n04204347 -shovel,n04208210 -shower cap,n04209133 -shower curtain,n04209239 -ski,n04228054 -ski mask,n04229816 -sleeping bag,n04235860 -slide rule,n04238763 -sliding door,n04239074 -slot,n04243546 -snorkel,n04251144 -snowmobile,n04252077 -snowplow,n04252225 -soap dispenser,n04254120 -soccer ball,n04254680 -sock,n04254777 -solar dish,n04258138 -sombrero,n04259630 -soup bowl,n04263257 -space bar,n04264628 -space heater,n04265275 -space shuttle,n04266014 -spatula,n04270147 -speedboat,n04273569 -spider web,n04275548 -spindle,n04277352 -sports car,n04285008 -spotlight,n04286575 -stage,n04296562 -steam locomotive,n04310018 -steel arch bridge,n04311004 -steel drum,n04311174 -stethoscope,n04317175 -stole,n04325704 -stone wall,n04326547 -stopwatch,n04328186 -stove,n04330267 -strainer,n04332243 -streetcar,n04335435 -stretcher,n04336792 -studio couch,n04344873 -stupa,n04346328 -submarine,n04347754 -suit,n04350905 -sundial,n04355338 -sunglass,n04355933 -sunglasses,n04356056 -sunscreen,n04357314 -suspension bridge,n04366367 -swab,n04367480 -sweatshirt,n04370456 -swimming trunks,n04371430 -swing,n04371774 -switch,n04372370 -syringe,n04376876 -table lamp,n04380533 -tank,n04389033 -tape player,n04392985 -teapot,n04398044 -teddy,n04399382 -television,n04404412 -tennis ball,n04409515 -thatch,n04417672 -theater curtain,n04418357 -thimble,n04423845 -thresher,n04428191 -throne,n04429376 -tile roof,n04435653 -toaster,n04442312 -tobacco shop,n04443257 -toilet seat,n04447861 -torch,n04456115 -totem pole,n04458633 -tow truck,n04461696 -toyshop,n04462240 -tractor,n04465501 -trailer truck,n04467665 -tray,n04476259 -trench coat,n04479046 -tricycle,n04482393 -trimaran,n04483307 -tripod,n04485082 -triumphal arch,n04486054 -trolleybus,n04487081 -trombone,n04487394 -tub,n04493381 -turnstile,n04501370 -typewriter keyboard,n04505470 -umbrella,n04507155 -unicycle,n04509417 -upright,n04515003 -vacuum,n04517823 -vase,n04522168 -vault,n04523525 -velvet,n04525038 -vending machine,n04525305 -vestment,n04532106 -viaduct,n04532670 -violin,n04536866 -volleyball,n04540053 -waffle iron,n04542943 -wall clock,n04548280 -wallet,n04548362 -wardrobe,n04550184 -warplane,n04552348 -washbasin,n04553703 -washer,n04554684 -water bottle,n04557648 -water 
jug,n04560804 -water tower,n04562935 -whiskey jug,n04579145 -whistle,n04579432 -wig,n04584207 -window screen,n04589890 -window shade,n04590129 -Windsor tie,n04591157 -wine bottle,n04591713 -wing,n04592741 -wok,n04596742 -wooden spoon,n04597913 -wool,n04599235 -worm fence,n04604644 -wreck,n04606251 -yawl,n04612504 -yurt,n04613696 -web site,n06359193 -comic book,n06596364 -crossword puzzle,n06785654 -street sign,n06794110 -traffic light,n06874185 -book jacket,n07248320 -menu,n07565083 -plate,n07579787 -guacamole,n07583066 -consomme,n07584110 -hot pot,n07590611 -trifle,n07613480 -ice cream,n07614500 -ice lolly,n07615774 -French loaf,n07684084 -bagel,n07693725 -pretzel,n07695742 -cheeseburger,n07697313 -hotdog,n07697537 -mashed potato,n07711569 -head cabbage,n07714571 -broccoli,n07714990 -cauliflower,n07715103 -zucchini,n07716358 -spaghetti squash,n07716906 -acorn squash,n07717410 -butternut squash,n07717556 -cucumber,n07718472 -artichoke,n07718747 -bell pepper,n07720875 -cardoon,n07730033 -mushroom,n07734744 -Granny Smith,n07742313 -strawberry,n07745940 -orange,n07747607 -lemon,n07749582 -fig,n07753113 -pineapple,n07753275 -banana,n07753592 -jackfruit,n07754684 -custard apple,n07760859 -pomegranate,n07768694 -hay,n07802026 -carbonara,n07831146 -chocolate sauce,n07836838 -dough,n07860988 -meat loaf,n07871810 -pizza,n07873807 -potpie,n07875152 -burrito,n07880968 -red wine,n07892512 -espresso,n07920052 -cup,n07930864 -eggnog,n07932039 -alp,n09193705 -bubble,n09229709 -cliff,n09246464 -coral reef,n09256479 -geyser,n09288635 -lakeside,n09332890 -promontory,n09399592 -sandbar,n09421951 -seashore,n09428293 -valley,n09468604 -volcano,n09472597 -ballplayer,n09835506 -groom,n10148035 -scuba diver,n10565667 -rapeseed,n11879895 -daisy,n11939491 -yellow lady's slipper,n12057211 -corn,n12144580 -acorn,n12267677 -hip,n12620546 -buckeye,n12768682 -coral fungus,n12985857 -agaric,n12998815 -gyromitra,n13037406 -stinkhorn,n13040303 -earthstar,n13044778 -hen-of-the-woods,n13052670 -bolete,n13054560 -ear,n13133613 -toilet tissue,n15075141 diff --git a/torchvision/prototype/datasets/_builtin/imagenet.py b/torchvision/prototype/datasets/_builtin/imagenet.py deleted file mode 100644 index 3192f1f5503..00000000000 --- a/torchvision/prototype/datasets/_builtin/imagenet.py +++ /dev/null @@ -1,223 +0,0 @@ -import enum -import pathlib -import re - -from typing import Any, BinaryIO, cast, Dict, Iterator, List, Match, Optional, Tuple, Union - -from torchdata.datapipes.iter import ( - Demultiplexer, - Enumerator, - Filter, - IterDataPipe, - IterKeyZipper, - LineReader, - Mapper, - TarArchiveLoader, -) -from torchdata.datapipes.map import IterToMapConverter -from torchvision.prototype.datasets.utils import Dataset, ManualDownloadResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - getitem, - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - path_accessor, - read_categories_file, - read_mat, -) -from torchvision.prototype.features import EncodedImage, Label - -from .._api import register_dataset, register_info - -NAME = "imagenet" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - categories, wnids = zip(*read_categories_file(NAME)) - return dict(categories=categories, wnids=wnids) - - -class ImageNetResource(ManualDownloadResource): - def __init__(self, **kwargs: Any) -> None: - super().__init__("Register on https://image-net.org/ and follow the instructions there.", **kwargs) - - -class ImageNetDemux(enum.IntEnum): - META = 0 - LABEL = 1 - - -class 
CategoryAndWordNetIDExtractor(IterDataPipe): - # Although the WordNet IDs (wnids) are unique, the corresponding categories are not. For example, both n02012849 - # and n03126707 are labeled 'crane' while the first means the bird and the latter means the construction equipment - _WNID_MAP = { - "n03126707": "construction crane", - "n03710721": "tank suit", - } - - def __init__(self, datapipe: IterDataPipe[Tuple[str, BinaryIO]]) -> None: - self.datapipe = datapipe - - def __iter__(self) -> Iterator[Tuple[str, str]]: - for _, stream in self.datapipe: - synsets = read_mat(stream, squeeze_me=True)["synsets"] - for _, wnid, category, _, num_children, *_ in synsets: - if num_children > 0: - # we are looking at a superclass that has no direct instance - continue - - yield self._WNID_MAP.get(wnid, category.split(",", 1)[0]), wnid - - -@register_dataset(NAME) -class ImageNet(Dataset): - """ - - **homepage**: https://www.image-net.org/ - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", {"train", "val", "test"}) - - info = _info() - categories, wnids = info["categories"], info["wnids"] - self._categories = categories - self._wnids = wnids - self._wnid_to_category = dict(zip(wnids, categories)) - - super().__init__(root, skip_integrity_check=skip_integrity_check) - - _IMAGES_CHECKSUMS = { - "train": "b08200a27a8e34218a0e58fde36b0fe8f73bc377f4acea2d91602057c3ca45bb", - "val": "c7e06a6c0baccf06d8dbeb6577d71efff84673a5dbdd50633ab44f8ea0456ae0", - "test_v10102019": "9cf7f8249639510f17d3d8a0deb47cd22a435886ba8e29e2b3223e65a4079eb4", - } - - def _resources(self) -> List[OnlineResource]: - name = "test_v10102019" if self._split == "test" else self._split - images = ImageNetResource( - file_name=f"ILSVRC2012_img_{name}.tar", - sha256=self._IMAGES_CHECKSUMS[name], - ) - resources: List[OnlineResource] = [images] - - if self._split == "val": - devkit = ImageNetResource( - file_name="ILSVRC2012_devkit_t12.tar.gz", - sha256="b59243268c0d266621fd587d2018f69e906fb22875aca0e295b48cafaa927953", - ) - resources.append(devkit) - - return resources - - _TRAIN_IMAGE_NAME_PATTERN = re.compile(r"(?Pn\d{8})_\d+[.]JPEG") - - def _prepare_train_data(self, data: Tuple[str, BinaryIO]) -> Tuple[Tuple[Label, str], Tuple[str, BinaryIO]]: - path = pathlib.Path(data[0]) - wnid = cast(Match[str], self._TRAIN_IMAGE_NAME_PATTERN.match(path.name))["wnid"] - label = Label.from_category(self._wnid_to_category[wnid], categories=self._categories) - return (label, wnid), data - - def _prepare_test_data(self, data: Tuple[str, BinaryIO]) -> Tuple[None, Tuple[str, BinaryIO]]: - return None, data - - def _classifiy_devkit(self, data: Tuple[str, BinaryIO]) -> Optional[int]: - return { - "meta.mat": ImageNetDemux.META, - "ILSVRC2012_validation_ground_truth.txt": ImageNetDemux.LABEL, - }.get(pathlib.Path(data[0]).name) - - _VAL_TEST_IMAGE_NAME_PATTERN = re.compile(r"ILSVRC2012_(val|test)_(?P\d{8})[.]JPEG") - - def _val_test_image_key(self, path: pathlib.Path) -> int: - return int(self._VAL_TEST_IMAGE_NAME_PATTERN.match(path.name)["id"]) # type: ignore[index] - - def _prepare_val_data( - self, data: Tuple[Tuple[int, str], Tuple[str, BinaryIO]] - ) -> Tuple[Tuple[Label, str], Tuple[str, BinaryIO]]: - label_data, image_data = data - _, wnid = label_data - label = Label.from_category(self._wnid_to_category[wnid], categories=self._categories) - return (label, wnid), image_data - - def _prepare_sample( - self, 
- data: Tuple[Optional[Tuple[Label, str]], Tuple[str, BinaryIO]], - ) -> Dict[str, Any]: - label_data, (path, buffer) = data - - return dict( - dict(zip(("label", "wnid"), label_data if label_data else (None, None))), - path=path, - image=EncodedImage.from_file(buffer), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - if self._split in {"train", "test"}: - dp = resource_dps[0] - - # the train archive is a tar of tars - if self._split == "train": - dp = TarArchiveLoader(dp) - - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - dp = Mapper(dp, self._prepare_train_data if self._split == "train" else self._prepare_test_data) - else: # config.split == "val": - images_dp, devkit_dp = resource_dps - - meta_dp, label_dp = Demultiplexer( - devkit_dp, 2, self._classifiy_devkit, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE - ) - - # We cannot use self._wnids here, since we use a different order than the dataset - meta_dp = CategoryAndWordNetIDExtractor(meta_dp) - wnid_dp = Mapper(meta_dp, getitem(1)) - wnid_dp = Enumerator(wnid_dp, 1) - wnid_map = IterToMapConverter(wnid_dp) - - label_dp = LineReader(label_dp, decode=True, return_path=False) - label_dp = Mapper(label_dp, int) - label_dp = Mapper(label_dp, wnid_map.__getitem__) - label_dp: IterDataPipe[Tuple[int, str]] = Enumerator(label_dp, 1) - label_dp = hint_shuffling(label_dp) - label_dp = hint_sharding(label_dp) - - dp = IterKeyZipper( - label_dp, - images_dp, - key_fn=getitem(0), - ref_key_fn=path_accessor(self._val_test_image_key), - buffer_size=INFINITE_BUFFER_SIZE, - ) - dp = Mapper(dp, self._prepare_val_data) - - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return { - "train": 1_281_167, - "val": 50_000, - "test": 100_000, - }[self._split] - - def _filter_meta(self, data: Tuple[str, Any]) -> bool: - return self._classifiy_devkit(data) == ImageNetDemux.META - - def _generate_categories(self) -> List[Tuple[str, ...]]: - self._split = "val" - resources = self._resources() - - devkit_dp = resources[1].load(self._root) - meta_dp = Filter(devkit_dp, self._filter_meta) - meta_dp = CategoryAndWordNetIDExtractor(meta_dp) - - categories_and_wnids = cast(List[Tuple[str, ...]], list(meta_dp)) - categories_and_wnids.sort(key=lambda category_and_wnid: category_and_wnid[1]) - return categories_and_wnids diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py deleted file mode 100644 index 7a459b2d0ea..00000000000 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ /dev/null @@ -1,415 +0,0 @@ -import abc -import functools -import operator -import pathlib -import string -from typing import Any, BinaryIO, cast, Dict, Iterator, List, Optional, Sequence, Tuple, Union - -import torch -from torchdata.datapipes.iter import Decompressor, Demultiplexer, IterDataPipe, Mapper, Zipper -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling, INFINITE_BUFFER_SIZE -from torchvision.prototype.features import Image, Label -from torchvision.prototype.utils._internal import fromfile - -from .._api import register_dataset, register_info - - -prod = functools.partial(functools.reduce, operator.mul) - - -class MNISTFileReader(IterDataPipe[torch.Tensor]): - _DTYPE_MAP = { - 8: torch.uint8, - 9: torch.int8, - 11: torch.int16, - 12: torch.int32, - 13: torch.float32, - 14: torch.float64, - } - - def __init__( - self, 
datapipe: IterDataPipe[Tuple[Any, BinaryIO]], *, start: Optional[int], stop: Optional[int] - ) -> None: - self.datapipe = datapipe - self.start = start - self.stop = stop - - def __iter__(self) -> Iterator[torch.Tensor]: - for _, file in self.datapipe: - read = functools.partial(fromfile, file, byte_order="big") - - magic = int(read(dtype=torch.int32, count=1)) - dtype = self._DTYPE_MAP[magic // 256] - ndim = magic % 256 - 1 - - num_samples = int(read(dtype=torch.int32, count=1)) - shape = cast(List[int], read(dtype=torch.int32, count=ndim).tolist()) if ndim else [] - count = prod(shape) if shape else 1 - - start = self.start or 0 - stop = min(self.stop, num_samples) if self.stop else num_samples - - if start: - num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - file.seek(num_bytes_per_value * count * start, 1) - - for _ in range(stop - start): - yield read(dtype=dtype, count=count).reshape(shape) - - -class _MNISTBase(Dataset): - _URL_BASE: Union[str, Sequence[str]] - - @abc.abstractmethod - def _files_and_checksums(self) -> Tuple[Tuple[str, str], Tuple[str, str]]: - pass - - def _resources(self) -> List[OnlineResource]: - (images_file, images_sha256), ( - labels_file, - labels_sha256, - ) = self._files_and_checksums() - - url_bases = self._URL_BASE - if isinstance(url_bases, str): - url_bases = (url_bases,) - - images_urls = [f"{url_base}/{images_file}" for url_base in url_bases] - images = HttpResource(images_urls[0], sha256=images_sha256, mirrors=images_urls[1:]) - - labels_urls = [f"{url_base}/{labels_file}" for url_base in url_bases] - labels = HttpResource(labels_urls[0], sha256=labels_sha256, mirrors=labels_urls[1:]) - - return [images, labels] - - def start_and_stop(self) -> Tuple[Optional[int], Optional[int]]: - return None, None - - _categories: List[str] - - def _prepare_sample(self, data: Tuple[torch.Tensor, torch.Tensor]) -> Dict[str, Any]: - image, label = data - return dict( - image=Image(image), - label=Label(label, dtype=torch.int64, categories=self._categories), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - images_dp, labels_dp = resource_dps - start, stop = self.start_and_stop() - - images_dp = Decompressor(images_dp) - images_dp = MNISTFileReader(images_dp, start=start, stop=stop) - - labels_dp = Decompressor(labels_dp) - labels_dp = MNISTFileReader(labels_dp, start=start, stop=stop) - - dp = Zipper(images_dp, labels_dp) - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - return Mapper(dp, self._prepare_sample) - - -@register_info("mnist") -def _mnist_info() -> Dict[str, Any]: - return dict( - categories=[str(label) for label in range(10)], - ) - - -@register_dataset("mnist") -class MNIST(_MNISTBase): - """ - - **homepage**: http://yann.lecun.com/exdb/mnist - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "test")) - super().__init__(root, skip_integrity_check=skip_integrity_check) - - _URL_BASE: Union[str, Sequence[str]] = ( - "http://yann.lecun.com/exdb/mnist", - "https://ossci-datasets.s3.amazonaws.com/mnist", - ) - _CHECKSUMS = { - "train-images-idx3-ubyte.gz": "440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609", - "train-labels-idx1-ubyte.gz": "3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c", - "t10k-images-idx3-ubyte.gz": 
"8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6", - "t10k-labels-idx1-ubyte.gz": "f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6", - } - - def _files_and_checksums(self) -> Tuple[Tuple[str, str], Tuple[str, str]]: - prefix = "train" if self._split == "train" else "t10k" - images_file = f"{prefix}-images-idx3-ubyte.gz" - labels_file = f"{prefix}-labels-idx1-ubyte.gz" - return (images_file, self._CHECKSUMS[images_file]), ( - labels_file, - self._CHECKSUMS[labels_file], - ) - - _categories = _mnist_info()["categories"] - - def __len__(self) -> int: - return 60_000 if self._split == "train" else 10_000 - - -@register_info("fashionmnist") -def _fashionmnist_info() -> Dict[str, Any]: - return dict( - categories=[ - "T-shirt/top", - "Trouser", - "Pullover", - "Dress", - "Coat", - "Sandal", - "Shirt", - "Sneaker", - "Bag", - "Ankle boot", - ], - ) - - -@register_dataset("fashionmnist") -class FashionMNIST(MNIST): - """ - - **homepage**: https://github.com/zalandoresearch/fashion-mnist - """ - - _URL_BASE = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com" - _CHECKSUMS = { - "train-images-idx3-ubyte.gz": "3aede38d61863908ad78613f6a32ed271626dd12800ba2636569512369268a84", - "train-labels-idx1-ubyte.gz": "a04f17134ac03560a47e3764e11b92fc97de4d1bfaf8ba1a3aa29af54cc90845", - "t10k-images-idx3-ubyte.gz": "346e55b948d973a97e58d2351dde16a484bd415d4595297633bb08f03db6a073", - "t10k-labels-idx1-ubyte.gz": "67da17c76eaffca5446c3361aaab5c3cd6d1c2608764d35dfb1850b086bf8dd5", - } - - _categories = _fashionmnist_info()["categories"] - - -@register_info("kmnist") -def _kmnist_info() -> Dict[str, Any]: - return dict( - categories=["o", "ki", "su", "tsu", "na", "ha", "ma", "ya", "re", "wo"], - ) - - -@register_dataset("kmnist") -class KMNIST(MNIST): - """ - - **homepage**: http://codh.rois.ac.jp/kmnist/index.html.en - """ - - _URL_BASE = "http://codh.rois.ac.jp/kmnist/dataset/kmnist" - _CHECKSUMS = { - "train-images-idx3-ubyte.gz": "51467d22d8cc72929e2a028a0428f2086b092bb31cfb79c69cc0a90ce135fde4", - "train-labels-idx1-ubyte.gz": "e38f9ebcd0f3ebcdec7fc8eabdcdaef93bb0df8ea12bee65224341c8183d8e17", - "t10k-images-idx3-ubyte.gz": "edd7a857845ad6bb1d0ba43fe7e794d164fe2dce499a1694695a792adfac43c5", - "t10k-labels-idx1-ubyte.gz": "20bb9a0ef54c7db3efc55a92eef5582c109615df22683c380526788f98e42a1c", - } - - _categories = _kmnist_info()["categories"] - - -@register_info("emnist") -def _emnist_info() -> Dict[str, Any]: - return dict( - categories=list(string.digits + string.ascii_uppercase + string.ascii_lowercase), - ) - - -@register_dataset("emnist") -class EMNIST(_MNISTBase): - """ - - **homepage**: https://www.westernsydney.edu.au/icns/reproducible_research/publication_support_materials/emnist - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - image_set: str = "Balanced", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "test")) - self._image_set = self._verify_str_arg( - image_set, "image_set", ("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST") - ) - super().__init__(root, skip_integrity_check=skip_integrity_check) - - _URL_BASE = "https://rds.westernsydney.edu.au/Institutes/MARCS/BENS/EMNIST" - - def _files_and_checksums(self) -> Tuple[Tuple[str, str], Tuple[str, str]]: - prefix = f"emnist-{self._image_set.replace('_', '').lower()}-{self._split}" - images_file = f"{prefix}-images-idx3-ubyte.gz" - labels_file = f"{prefix}-labels-idx1-ubyte.gz" - # 
Since EMNIST provides the data files inside an archive, we don't need to provide checksums for them - return (images_file, ""), (labels_file, "") - - def _resources(self) -> List[OnlineResource]: - return [ - HttpResource( - f"{self._URL_BASE}/emnist-gzip.zip", - sha256="909a2a39c5e86bdd7662425e9b9c4a49bb582bf8d0edad427f3c3a9d0c6f7259", - ) - ] - - def _classify_archive(self, data: Tuple[str, Any]) -> Optional[int]: - path = pathlib.Path(data[0]) - (images_file, _), (labels_file, _) = self._files_and_checksums() - if path.name == images_file: - return 0 - elif path.name == labels_file: - return 1 - else: - return None - - _categories = _emnist_info()["categories"] - - _LABEL_OFFSETS = { - 38: 1, - 39: 1, - 40: 1, - 41: 1, - 42: 1, - 43: 6, - 44: 8, - 45: 8, - 46: 9, - } - - def _prepare_sample(self, data: Tuple[torch.Tensor, torch.Tensor]) -> Dict[str, Any]: - # In these two splits, some lowercase letters are merged into their uppercase ones (see Fig 2. in the paper). - # That means for example that there is 'D', 'd', and 'C', but not 'c'. Since the labels are nevertheless dense, - # i.e. no gaps between 0 and 46 for 47 total classes, we need to add an offset to create these gaps. For - # example, since there is no 'c', 'd' corresponds to - # label 38 (10 digits + 26 uppercase letters + 3rd unmerged lower case letter - 1 for zero indexing), - # and at the same time corresponds to - # index 39 (10 digits + 26 uppercase letters + 4th lower case letter - 1 for zero indexing) - # in self._categories. Thus, we need to add 1 to the label to correct this. - if self._image_set in ("Balanced", "By_Merge"): - image, label = data - label += self._LABEL_OFFSETS.get(int(label), 0) - data = (image, label) - return super()._prepare_sample(data) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - archive_dp = resource_dps[0] - images_dp, labels_dp = Demultiplexer( - archive_dp, - 2, - self._classify_archive, - drop_none=True, - buffer_size=INFINITE_BUFFER_SIZE, - ) - return super()._datapipe([images_dp, labels_dp]) - - def __len__(self) -> int: - return { - ("train", "Balanced"): 112_800, - ("train", "By_Merge"): 697_932, - ("train", "By_Class"): 697_932, - ("train", "Letters"): 124_800, - ("train", "Digits"): 240_000, - ("train", "MNIST"): 60_000, - ("test", "Balanced"): 18_800, - ("test", "By_Merge"): 116_323, - ("test", "By_Class"): 116_323, - ("test", "Letters"): 20_800, - ("test", "Digits"): 40_000, - ("test", "MNIST"): 10_000, - }[(self._split, self._image_set)] - - -@register_info("qmnist") -def _qmnist_info() -> Dict[str, Any]: - return dict( - categories=[str(label) for label in range(10)], - ) - - -@register_dataset("qmnist") -class QMNIST(_MNISTBase): - """ - - **homepage**: https://github.com/facebookresearch/qmnist - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "test", "test10k", "test50k", "nist")) - super().__init__(root, skip_integrity_check=skip_integrity_check) - - _URL_BASE = "https://raw.githubusercontent.com/facebookresearch/qmnist/master" - _CHECKSUMS = { - "qmnist-train-images-idx3-ubyte.gz": "9e26a7bf1683614e065d7b76460ccd52807165b3f22561fb782bd9f38c52b51d", - "qmnist-train-labels-idx2-int.gz": "2c05dc77f6b916b38e455e97ab129a42a444f3dbef09b278a366f82904e0dd9f", - "qmnist-test-images-idx3-ubyte.gz": "43fc22bf7498b8fc98de98369d72f752d0deabc280a43a7bcc364ab19e57b375", - 
"qmnist-test-labels-idx2-int.gz": "9fbcbe594c3766fdf4f0b15c5165dc0d1e57ac604e01422608bb72c906030d06", - "xnist-images-idx3-ubyte.xz": "f075553993026d4359ded42208eff77a1941d3963c1eff49d6015814f15f0984", - "xnist-labels-idx2-int.xz": "db042968723ec2b7aed5f1beac25d2b6e983b9286d4f4bf725f1086e5ae55c4f", - } - - def _files_and_checksums(self) -> Tuple[Tuple[str, str], Tuple[str, str]]: - prefix = "xnist" if self._split == "nist" else f"qmnist-{'train' if self._split == 'train' else 'test'}" - suffix = "xz" if self._split == "nist" else "gz" - images_file = f"{prefix}-images-idx3-ubyte.{suffix}" - labels_file = f"{prefix}-labels-idx2-int.{suffix}" - return (images_file, self._CHECKSUMS[images_file]), ( - labels_file, - self._CHECKSUMS[labels_file], - ) - - def start_and_stop(self) -> Tuple[Optional[int], Optional[int]]: - start: Optional[int] - stop: Optional[int] - if self._split == "test10k": - start = 0 - stop = 10000 - elif self._split == "test50k": - start = 10000 - stop = None - else: - start = stop = None - - return start, stop - - _categories = _emnist_info()["categories"] - - def _prepare_sample(self, data: Tuple[torch.Tensor, torch.Tensor]) -> Dict[str, Any]: - image, ann = data - label, *extra_anns = ann - sample = super()._prepare_sample((image, label)) - - sample.update( - dict( - zip( - ("nist_hsf_series", "nist_writer_id", "digit_index", "nist_label", "global_digit_index"), - [int(value) for value in extra_anns[:5]], - ) - ) - ) - sample.update(dict(zip(("duplicate", "unused"), [bool(value) for value in extra_anns[-2:]]))) - return sample - - def __len__(self) -> int: - return { - "train": 60_000, - "test": 60_000, - "test10k": 10_000, - "test50k": 50_000, - "nist": 402_953, - }[self._split] diff --git a/torchvision/prototype/datasets/_builtin/oxford-iiit-pet.categories b/torchvision/prototype/datasets/_builtin/oxford-iiit-pet.categories deleted file mode 100644 index 36d29465b04..00000000000 --- a/torchvision/prototype/datasets/_builtin/oxford-iiit-pet.categories +++ /dev/null @@ -1,37 +0,0 @@ -Abyssinian -American Bulldog -American Pit Bull Terrier -Basset Hound -Beagle -Bengal -Birman -Bombay -Boxer -British Shorthair -Chihuahua -Egyptian Mau -English Cocker Spaniel -English Setter -German Shorthaired -Great Pyrenees -Havanese -Japanese Chin -Keeshond -Leonberger -Maine Coon -Miniature Pinscher -Newfoundland -Persian -Pomeranian -Pug -Ragdoll -Russian Blue -Saint Bernard -Samoyed -Scottish Terrier -Shiba Inu -Siamese -Sphynx -Staffordshire Bull Terrier -Wheaten Terrier -Yorkshire Terrier diff --git a/torchvision/prototype/datasets/_builtin/oxford_iiit_pet.py b/torchvision/prototype/datasets/_builtin/oxford_iiit_pet.py deleted file mode 100644 index 499dbd837ed..00000000000 --- a/torchvision/prototype/datasets/_builtin/oxford_iiit_pet.py +++ /dev/null @@ -1,146 +0,0 @@ -import enum -import pathlib -from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union - -from torchdata.datapipes.iter import CSVDictParser, Demultiplexer, Filter, IterDataPipe, IterKeyZipper, Mapper -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - getitem, - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - path_accessor, - path_comparator, - read_categories_file, -) -from torchvision.prototype.features import EncodedImage, Label - -from .._api import register_dataset, register_info - - -NAME = "oxford-iiit-pet" - - -class OxfordIIITPetDemux(enum.IntEnum): - SPLIT_AND_CLASSIFICATION = 0 - 
SEGMENTATIONS = 1 - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict(categories=read_categories_file(NAME)) - - -@register_dataset(NAME) -class OxfordIIITPet(Dataset): - """Oxford IIIT Pet Dataset - homepage="https://www.robots.ox.ac.uk/~vgg/data/pets/", - """ - - def __init__( - self, root: Union[str, pathlib.Path], *, split: str = "trainval", skip_integrity_check: bool = False - ) -> None: - self._split = self._verify_str_arg(split, "split", {"trainval", "test"}) - self._categories = _info()["categories"] - super().__init__(root, skip_integrity_check=skip_integrity_check) - - def _resources(self) -> List[OnlineResource]: - images = HttpResource( - "https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz", - sha256="67195c5e1c01f1ab5f9b6a5d22b8c27a580d896ece458917e61d459337fa318d", - preprocess="decompress", - ) - anns = HttpResource( - "https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz", - sha256="52425fb6de5c424942b7626b428656fcbd798db970a937df61750c0f1d358e91", - preprocess="decompress", - ) - return [images, anns] - - def _classify_anns(self, data: Tuple[str, Any]) -> Optional[int]: - return { - "annotations": OxfordIIITPetDemux.SPLIT_AND_CLASSIFICATION, - "trimaps": OxfordIIITPetDemux.SEGMENTATIONS, - }.get(pathlib.Path(data[0]).parent.name) - - def _filter_images(self, data: Tuple[str, Any]) -> bool: - return pathlib.Path(data[0]).suffix == ".jpg" - - def _filter_segmentations(self, data: Tuple[str, Any]) -> bool: - return not pathlib.Path(data[0]).name.startswith(".") - - def _prepare_sample( - self, data: Tuple[Tuple[Dict[str, str], Tuple[str, BinaryIO]], Tuple[str, BinaryIO]] - ) -> Dict[str, Any]: - ann_data, image_data = data - classification_data, segmentation_data = ann_data - segmentation_path, segmentation_buffer = segmentation_data - image_path, image_buffer = image_data - - return dict( - label=Label(int(classification_data["label"]) - 1, categories=self._categories), - species="cat" if classification_data["species"] == "1" else "dog", - segmentation_path=segmentation_path, - segmentation=EncodedImage.from_file(segmentation_buffer), - image_path=image_path, - image=EncodedImage.from_file(image_buffer), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - images_dp, anns_dp = resource_dps - - images_dp = Filter(images_dp, self._filter_images) - - split_and_classification_dp, segmentations_dp = Demultiplexer( - anns_dp, - 2, - self._classify_anns, - drop_none=True, - buffer_size=INFINITE_BUFFER_SIZE, - ) - - split_and_classification_dp = Filter(split_and_classification_dp, path_comparator("name", f"{self._split}.txt")) - split_and_classification_dp = CSVDictParser( - split_and_classification_dp, fieldnames=("image_id", "label", "species"), delimiter=" " - ) - split_and_classification_dp = hint_shuffling(split_and_classification_dp) - split_and_classification_dp = hint_sharding(split_and_classification_dp) - - segmentations_dp = Filter(segmentations_dp, self._filter_segmentations) - - anns_dp = IterKeyZipper( - split_and_classification_dp, - segmentations_dp, - key_fn=getitem("image_id"), - ref_key_fn=path_accessor("stem"), - buffer_size=INFINITE_BUFFER_SIZE, - ) - - dp = IterKeyZipper( - anns_dp, - images_dp, - key_fn=getitem(0, "image_id"), - ref_key_fn=path_accessor("stem"), - buffer_size=INFINITE_BUFFER_SIZE, - ) - return Mapper(dp, self._prepare_sample) - - def _filter_split_and_classification_anns(self, data: Tuple[str, Any]) -> bool: - return self._classify_anns(data) == 
OxfordIIITPetDemux.SPLIT_AND_CLASSIFICATION - - def _generate_categories(self) -> List[str]: - resources = self._resources() - - dp = resources[1].load(self._root) - dp = Filter(dp, self._filter_split_and_classification_anns) - dp = Filter(dp, path_comparator("name", "trainval.txt")) - dp = CSVDictParser(dp, fieldnames=("image_id", "label"), delimiter=" ") - - raw_categories_and_labels = {(data["image_id"].rsplit("_", 1)[0], data["label"]) for data in dp} - raw_categories, _ = zip( - *sorted(raw_categories_and_labels, key=lambda raw_category_and_label: int(raw_category_and_label[1])) - ) - return [" ".join(part.title() for part in raw_category.split("_")) for raw_category in raw_categories] - - def __len__(self) -> int: - return 3_680 if self._split == "trainval" else 3_669 diff --git a/torchvision/prototype/datasets/_builtin/pcam.py b/torchvision/prototype/datasets/_builtin/pcam.py deleted file mode 100644 index 162f22f1abd..00000000000 --- a/torchvision/prototype/datasets/_builtin/pcam.py +++ /dev/null @@ -1,126 +0,0 @@ -import io -import pathlib -from collections import namedtuple -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union - -from torchdata.datapipes.iter import IterDataPipe, Mapper, Zipper -from torchvision.prototype import features -from torchvision.prototype.datasets.utils import Dataset, GDriveResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling -from torchvision.prototype.features import Label - -from .._api import register_dataset, register_info - - -NAME = "pcam" - - -class PCAMH5Reader(IterDataPipe[Tuple[str, io.IOBase]]): - def __init__( - self, - datapipe: IterDataPipe[Tuple[str, io.IOBase]], - key: Optional[str] = None, # Note: this key thing might be very specific to the PCAM dataset - ) -> None: - self.datapipe = datapipe - self.key = key - - def __iter__(self) -> Iterator[Tuple[str, io.IOBase]]: - import h5py - - for _, handle in self.datapipe: - with h5py.File(handle) as data: - if self.key is not None: - data = data[self.key] - yield from data - - -_Resource = namedtuple("_Resource", ("file_name", "gdrive_id", "sha256")) - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict(categories=["0", "1"]) - - -@register_dataset(NAME) -class PCAM(Dataset): - # TODO write proper docstring - """PCAM Dataset - - homepage="https://github.com/basveeling/pcam" - """ - - def __init__( - self, root: Union[str, pathlib.Path], split: str = "train", *, skip_integrity_check: bool = False - ) -> None: - self._split = self._verify_str_arg(split, "split", {"train", "val", "test"}) - self._categories = _info()["categories"] - super().__init__(root, skip_integrity_check=skip_integrity_check, dependencies=("h5py",)) - - _RESOURCES = { - "train": ( - _Resource( # Images - file_name="camelyonpatch_level_2_split_train_x.h5.gz", - gdrive_id="1Ka0XfEMiwgCYPdTI-vv6eUElOBnKFKQ2", - sha256="d619e741468a7ab35c7e4a75e6821b7e7e6c9411705d45708f2a0efc8960656c", - ), - _Resource( # Targets - file_name="camelyonpatch_level_2_split_train_y.h5.gz", - gdrive_id="1269yhu3pZDP8UYFQs-NYs3FPwuK-nGSG", - sha256="b74126d2c01b20d3661f9b46765d29cf4e4fba6faba29c8e0d09d406331ab75a", - ), - ), - "test": ( - _Resource( # Images - file_name="camelyonpatch_level_2_split_test_x.h5.gz", - gdrive_id="1qV65ZqZvWzuIVthK8eVDhIwrbnsJdbg_", - sha256="79174c2201ad521602a5888be8f36ee10875f37403dd3f2086caf2182ef87245", - ), - _Resource( # Targets - file_name="camelyonpatch_level_2_split_test_y.h5.gz", - 
gdrive_id="17BHrSrwWKjYsOgTMmoqrIjDy6Fa2o_gP", - sha256="0a522005fccc8bbd04c5a117bfaf81d8da2676f03a29d7499f71d0a0bd6068ef", - ), - ), - "val": ( - _Resource( # Images - file_name="camelyonpatch_level_2_split_valid_x.h5.gz", - gdrive_id="1hgshYGWK8V-eGRy8LToWJJgDU_rXWVJ3", - sha256="f82ee1670d027b4ec388048d9eabc2186b77c009655dae76d624c0ecb053ccb2", - ), - _Resource( # Targets - file_name="camelyonpatch_level_2_split_valid_y.h5.gz", - gdrive_id="1bH8ZRbhSVAhScTS0p9-ZzGnX91cHT3uO", - sha256="ce1ae30f08feb468447971cfd0472e7becd0ad96d877c64120c72571439ae48c", - ), - ), - } - - def _resources(self) -> List[OnlineResource]: - return [ # = [images resource, targets resource] - GDriveResource(file_name=file_name, id=gdrive_id, sha256=sha256, preprocess="decompress") - for file_name, gdrive_id, sha256 in self._RESOURCES[self._split] - ] - - def _prepare_sample(self, data: Tuple[Any, Any]) -> Dict[str, Any]: - image, target = data # They're both numpy arrays at this point - - return { - "image": features.Image(image.transpose(2, 0, 1)), - "label": Label(target.item(), categories=self._categories), - } - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - - images_dp, targets_dp = resource_dps - - images_dp = PCAMH5Reader(images_dp, key="x") - targets_dp = PCAMH5Reader(targets_dp, key="y") - - dp = Zipper(images_dp, targets_dp) - dp = hint_shuffling(dp) - dp = hint_sharding(dp) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return 262_144 if self._split == "train" else 32_768 diff --git a/torchvision/prototype/datasets/_builtin/sbd.categories b/torchvision/prototype/datasets/_builtin/sbd.categories deleted file mode 100644 index 8420ab35ede..00000000000 --- a/torchvision/prototype/datasets/_builtin/sbd.categories +++ /dev/null @@ -1,20 +0,0 @@ -aeroplane -bicycle -bird -boat -bottle -bus -car -cat -chair -cow -diningtable -dog -horse -motorbike -person -pottedplant -sheep -sofa -train -tvmonitor diff --git a/torchvision/prototype/datasets/_builtin/sbd.py b/torchvision/prototype/datasets/_builtin/sbd.py deleted file mode 100644 index c7a79c4188e..00000000000 --- a/torchvision/prototype/datasets/_builtin/sbd.py +++ /dev/null @@ -1,153 +0,0 @@ -import pathlib -import re -from typing import Any, BinaryIO, cast, Dict, List, Optional, Tuple, Union - -import numpy as np -from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, LineReader, Mapper -from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource -from torchvision.prototype.datasets.utils._internal import ( - getitem, - hint_sharding, - hint_shuffling, - INFINITE_BUFFER_SIZE, - path_accessor, - path_comparator, - read_categories_file, - read_mat, -) -from torchvision.prototype.features import _Feature, EncodedImage - -from .._api import register_dataset, register_info - -NAME = "sbd" - - -@register_info(NAME) -def _info() -> Dict[str, Any]: - return dict(categories=read_categories_file(NAME)) - - -@register_dataset(NAME) -class SBD(Dataset): - """ - - **homepage**: http://home.bharathh.info/pubs/codes/SBD/download.html - - **dependencies**: - - <scipy `https://scipy.org/`>_ - """ - - def __init__( - self, - root: Union[str, pathlib.Path], - *, - split: str = "train", - skip_integrity_check: bool = False, - ) -> None: - self._split = self._verify_str_arg(split, "split", ("train", "val", "train_noval")) - - self._categories = _info()["categories"] - - super().__init__(root, dependencies=("scipy",), skip_integrity_check=skip_integrity_check) - - def 
_resources(self) -> List[OnlineResource]: - archive = HttpResource( - "https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz", - sha256="6a5a2918d5c73ce032fdeba876574d150d9d04113ab87540a1304cbcc715be53", - ) - extra_split = HttpResource( - "http://home.bharathh.info/pubs/codes/SBD/train_noval.txt", - sha256="0b2068f7a359d2907431803e1cd63bf6162da37d7d503b589d3b08c6fd0c2432", - ) - return [archive, extra_split] - - def _classify_archive(self, data: Tuple[str, Any]) -> Optional[int]: - path = pathlib.Path(data[0]) - parent, grandparent, *_ = path.parents - - if parent.name == "dataset": - return 0 - elif grandparent.name == "dataset": - if parent.name == "img": - return 1 - elif parent.name == "cls": - return 2 - else: - return None - else: - return None - - def _prepare_sample(self, data: Tuple[Tuple[Any, Tuple[str, BinaryIO]], Tuple[str, BinaryIO]]) -> Dict[str, Any]: - split_and_image_data, ann_data = data - _, image_data = split_and_image_data - image_path, image_buffer = image_data - ann_path, ann_buffer = ann_data - - anns = read_mat(ann_buffer, squeeze_me=True)["GTcls"] - - return dict( - image_path=image_path, - image=EncodedImage.from_file(image_buffer), - ann_path=ann_path, - # the boundaries are stored in sparse CSC format, which is not supported by PyTorch - boundaries=_Feature(np.stack([raw_boundary.toarray() for raw_boundary in anns["Boundaries"].item()])), - segmentation=_Feature(anns["Segmentation"].item()), - ) - - def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: - archive_dp, extra_split_dp = resource_dps - - archive_dp = resource_dps[0] - split_dp, images_dp, anns_dp = Demultiplexer( - archive_dp, - 3, - self._classify_archive, - buffer_size=INFINITE_BUFFER_SIZE, - drop_none=True, - ) - if self._split == "train_noval": - split_dp = extra_split_dp - - split_dp = Filter(split_dp, path_comparator("name", f"{self._split}.txt")) - split_dp = LineReader(split_dp, decode=True) - split_dp = hint_shuffling(split_dp) - split_dp = hint_sharding(split_dp) - - dp = split_dp - for level, data_dp in enumerate((images_dp, anns_dp)): - dp = IterKeyZipper( - dp, - data_dp, - key_fn=getitem(*[0] * level, 1), - ref_key_fn=path_accessor("stem"), - buffer_size=INFINITE_BUFFER_SIZE, - ) - return Mapper(dp, self._prepare_sample) - - def __len__(self) -> int: - return { - "train": 8_498, - "val": 2_857, - "train_noval": 5_623, - }[self._split] - - def _generate_categories(self) -> Tuple[str, ...]: - resources = self._resources() - - dp = resources[0].load(self._root) - dp = Filter(dp, path_comparator("name", "category_names.m")) - dp = LineReader(dp) - dp = Mapper(dp, bytes.decode, input_col=1) - lines = tuple(zip(*iter(dp)))[1] - - pattern = re.compile(r"\s*'(?P<category>\w+)';\s*%(?P