
Commit

decode mp3 with librosa if torchaudio >= 0.12 doesn't work as a temporary workaround (#4923)

* decode mp3 with librosa if torchaudio >= 0.12 (ideally the ffmpeg version should be checked too)

* decode mp3 with torchaudio>=0.12 if it works (instead of librosa)

* fix incorrect marks for mp3 tests (require torchaudio, not sndfile)

* add tests for latest torchaudio + separate stage in CI for it (first try)

* install ffmpeg only on ubuntu

* use mock to emulate a torchaudio failure, add tests for librosa (not all of them)

* test torchaudio_latest only on ubuntu

* try/except decoding with librosa for file-like objects

* more tests for latest torchaudio, should be a complete set now

* replace logging with warnings

* fix tests: catch warnings with a pytest context manager

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
polinaeterna and lhoestq committed Sep 20, 2022
1 parent 8ba0522 commit 142404f
Showing 5 changed files with 237 additions and 17 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/ci.yml
@@ -72,3 +72,13 @@ jobs:
- name: Test with pytest
run: |
python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
- name: Install dependencies to test torchaudio>=0.12 on Ubuntu
if: ${{ matrix.os == 'ubuntu-latest' }}
run: |
pip uninstall -y torchaudio torch
pip install "torchaudio>=0.12"
sudo apt-get -y install ffmpeg
- name: Test torchaudio>=0.12 on Ubuntu
if: ${{ matrix.os == 'ubuntu-latest' }}
run: |
python -m pytest -rfExX -m torchaudio_latest -n 2 --dist loadfile -sv ./tests/features/test_audio.py
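The new CI stage installs ffmpeg because torchaudio>=0.12 no longer decodes mp3 through the sox_io backend and relies on ffmpeg instead (see the error message added in audio.py below). As a rough local check of whether that path works on a given machine (a sketch, not part of the commit; "sample.mp3" is a hypothetical file):

# Sketch (not part of the diff): check whether torchaudio>=0.12 can decode mp3
# in this environment, i.e. whether the librosa fallback below would kick in.
from packaging import version

try:
    import torchaudio
except ImportError:
    print("torchaudio is not installed")
else:
    if version.parse(torchaudio.__version__) < version.parse("0.12.0"):
        print("torchaudio<0.12 decodes mp3 via the sox_io backend")
    else:
        try:
            torchaudio.load("sample.mp3", format="mp3")  # hypothetical local file
            print("torchaudio>=0.12 can decode mp3 here (ffmpeg backend available)")
        except RuntimeError:
            print("torchaudio>=0.12 cannot decode mp3 here; `datasets` will fall back to librosa")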
58 changes: 45 additions & 13 deletions src/datasets/features/audio.py
@@ -1,4 +1,5 @@
import os
import warnings
from dataclasses import dataclass, field
from io import BytesIO
from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union
@@ -268,7 +269,7 @@ def _decode_non_mp3_file_like(self, file, format=None):
if version.parse(sf.__libsndfile_version__) < version.parse("1.0.30"):
raise RuntimeError(
"Decoding .opus files requires 'libsndfile'>=1.0.30, "
+ "it can be installed via conda: `conda install -c conda-forge libsndfile>=1.0.30`"
+ 'it can be installed via conda: `conda install -c conda-forge "libsndfile>=1.0.30"`'
)
array, sampling_rate = sf.read(file)
array = array.T
@@ -282,19 +283,44 @@ def _decode_non_mp3_file_like(self, file, format=None):
def _decode_mp3(self, path_or_file):
try:
import torchaudio
import torchaudio.transforms as T
except ImportError as err:
raise ImportError(
"Decoding 'mp3' audio files, requires 'torchaudio<0.12.0': pip install 'torchaudio<0.12.0'"
) from err
if not version.parse(torchaudio.__version__) < version.parse("0.12.0"):
raise RuntimeError(
"Decoding 'mp3' audio files, requires 'torchaudio<0.12.0': pip install 'torchaudio<0.12.0'"
)
try:
torchaudio.set_audio_backend("sox_io")
except RuntimeError as err:
raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err
raise ImportError("To support decoding 'mp3' audio files, please install 'torchaudio'.") from err
if version.parse(torchaudio.__version__) < version.parse("0.12.0"):
try:
torchaudio.set_audio_backend("sox_io")
except RuntimeError as err:
raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err
array, sampling_rate = self._decode_mp3_torchaudio(path_or_file)
else:
try: # try torchaudio anyway because sometimes it works (depending on the os and os packages installed)
array, sampling_rate = self._decode_mp3_torchaudio(path_or_file)
except RuntimeError:
try:
# flake8: noqa
import librosa
except ImportError as err:
raise ImportError(
"Your version of `torchaudio` (>=0.12.0) doesn't support decoding 'mp3' files on your machine. "
"To support 'mp3' decoding with `torchaudio>=0.12.0`, please install `ffmpeg>=4` system package "
'or downgrade `torchaudio` to <0.12: `pip install "torchaudio<0.12"`. '
"To support decoding 'mp3' audio files without `torchaudio`, please install `librosa`: "
"`pip install librosa`. Note that decoding will be extremely slow in that case."
) from err
# try to decode with librosa for torchaudio>=0.12.0 as a workaround
warnings.warn("Decoding mp3 with `librosa` instead of `torchaudio`, decoding is slow.")
try:
array, sampling_rate = self._decode_mp3_librosa(path_or_file)
except RuntimeError as err:
raise RuntimeError(
"Decoding of 'mp3' failed, probably because of streaming mode "
"(`librosa` cannot decode 'mp3' file-like objects, only path-like)."
) from err

return array, sampling_rate

def _decode_mp3_torchaudio(self, path_or_file):
import torchaudio
import torchaudio.transforms as T

array, sampling_rate = torchaudio.load(path_or_file, format="mp3")
if self.sampling_rate and self.sampling_rate != sampling_rate:
@@ -306,3 +332,9 @@ def _decode_mp3(self, path_or_file):
if self.mono:
array = array.mean(axis=0)
return array, sampling_rate

def _decode_mp3_librosa(self, path_or_file):
import librosa

array, sampling_rate = librosa.load(path_or_file, mono=self.mono, sr=self.sampling_rate)
return array, sampling_rate
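
For context, a minimal usage sketch (not part of the diff) of how the fallback surfaces through the Audio feature; "sample.mp3" is a hypothetical local file, and the warning is only emitted when torchaudio>=0.12 fails to decode it:

# Sketch (not part of the diff): decode an mp3 through the Audio feature and
# detect whether the slower librosa fallback was used.
import warnings

from datasets import Audio, Dataset, Features

dset = Dataset.from_dict(
    {"audio": ["sample.mp3"]},  # hypothetical local file
    features=Features({"audio": Audio(sampling_rate=16000)}),
)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    example = dset[0]["audio"]  # decodes with torchaudio, or librosa if torchaudio>=0.12 fails

print(example["array"].shape, example["sampling_rate"])
if any("librosa" in str(w.message) for w in caught):
    print("decoded with the librosa fallback (slow)")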
4 changes: 4 additions & 0 deletions tests/conftest.py
@@ -15,6 +15,10 @@ def pytest_collection_modifyitems(config, items):
item.add_marker(pytest.mark.unit)


def pytest_configure(config):
config.addinivalue_line("markers", "torchaudio_latest: mark test to run with torchaudio>=0.12")


@pytest.fixture(autouse=True)
def set_test_cache_config(tmp_path_factory, monkeypatch):
# test_hf_cache_home = tmp_path_factory.mktemp("cache") # TODO: why a cache dir per test function does not work?
171 changes: 168 additions & 3 deletions tests/features/test_audio.py
@@ -1,13 +1,21 @@
import os
import tarfile
from contextlib import nullcontext
from unittest.mock import patch

import pyarrow as pa
import pytest

from datasets import Dataset, concatenate_datasets, load_dataset
from datasets.features import Audio, Features, Sequence, Value

from ..utils import require_libsndfile_with_opus, require_sndfile, require_sox, require_torchaudio
from ..utils import (
require_libsndfile_with_opus,
require_sndfile,
require_sox,
require_torchaudio,
require_torchaudio_latest,
)


@pytest.fixture()
@@ -135,6 +143,26 @@ def test_audio_decode_example_mp3(shared_datadir):
assert decoded_example["sampling_rate"] == 44100


@pytest.mark.torchaudio_latest
@require_torchaudio_latest
@pytest.mark.parametrize("torchaudio_failed", [False, True])
def test_audio_decode_example_mp3_torchaudio_latest(shared_datadir, torchaudio_failed):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
audio = Audio()

with patch("torchaudio.load") if torchaudio_failed else nullcontext() as load_mock, pytest.warns(
UserWarning, match=r"Decoding mp3 with `librosa` instead of `torchaudio`.+?"
) if torchaudio_failed else nullcontext():

if torchaudio_failed:
load_mock.side_effect = RuntimeError()

decoded_example = audio.decode_example(audio.encode_example(audio_path))
assert decoded_example["path"] == audio_path
assert decoded_example["array"].shape == (110592,)
assert decoded_example["sampling_rate"] == 44100


@require_libsndfile_with_opus
def test_audio_decode_example_opus(shared_datadir):
audio_path = str(shared_datadir / "test_audio_48000.opus")
@@ -178,6 +206,34 @@ def test_audio_resampling_mp3_different_sampling_rates(shared_datadir):
assert decoded_example["sampling_rate"] == 48000


@pytest.mark.torchaudio_latest
@require_torchaudio_latest
@pytest.mark.parametrize("torchaudio_failed", [False, True])
def test_audio_resampling_mp3_different_sampling_rates_torchaudio_latest(shared_datadir, torchaudio_failed):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
audio_path2 = str(shared_datadir / "test_audio_16000.mp3")
audio = Audio(sampling_rate=48000)

# if torchaudio>=0.12 failed, mp3 must be decoded anyway (with librosa)
with patch("torchaudio.load") if torchaudio_failed else nullcontext() as load_mock, pytest.warns(
UserWarning, match=r"Decoding mp3 with `librosa` instead of `torchaudio`.+?"
) if torchaudio_failed else nullcontext():
if torchaudio_failed:
load_mock.side_effect = RuntimeError()

decoded_example = audio.decode_example(audio.encode_example(audio_path))
assert decoded_example.keys() == {"path", "array", "sampling_rate"}
assert decoded_example["path"] == audio_path
assert decoded_example["array"].shape == (120373,)
assert decoded_example["sampling_rate"] == 48000

decoded_example = audio.decode_example(audio.encode_example(audio_path2))
assert decoded_example.keys() == {"path", "array", "sampling_rate"}
assert decoded_example["path"] == audio_path2
assert decoded_example["array"].shape == (122688,)
assert decoded_example["sampling_rate"] == 48000


@require_sndfile
def test_dataset_with_audio_feature(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
@@ -266,6 +322,38 @@ def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):
assert column[0]["sampling_rate"] == 44100


@pytest.mark.torchaudio_latest
@require_torchaudio_latest
def test_dataset_with_audio_feature_tar_mp3_torchaudio_latest(tar_mp3_path):
# no test for librosa here because it doesn't support file-like objects, only paths
audio_filename = "test_audio_44100.mp3"
data = {"audio": []}
for file_path, file_obj in iter_archive(tar_mp3_path):
data["audio"].append({"path": file_path, "bytes": file_obj.read()})
break
features = Features({"audio": Audio()})
dset = Dataset.from_dict(data, features=features)
item = dset[0]
assert item.keys() == {"audio"}
assert item["audio"].keys() == {"path", "array", "sampling_rate"}
assert item["audio"]["path"] == audio_filename
assert item["audio"]["array"].shape == (110592,)
assert item["audio"]["sampling_rate"] == 44100
batch = dset[:1]
assert batch.keys() == {"audio"}
assert len(batch["audio"]) == 1
assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
assert batch["audio"][0]["path"] == audio_filename
assert batch["audio"][0]["array"].shape == (110592,)
assert batch["audio"][0]["sampling_rate"] == 44100
column = dset["audio"]
assert len(column) == 1
assert column[0].keys() == {"path", "array", "sampling_rate"}
assert column[0]["path"] == audio_filename
assert column[0]["array"].shape == (110592,)
assert column[0]["sampling_rate"] == 44100


@require_sndfile
def test_dataset_with_audio_feature_with_none():
data = {"audio": [None]}
@@ -328,7 +416,7 @@ def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir):


@require_sox
@require_sndfile
@require_torchaudio
def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
data = {"audio": [audio_path]}
@@ -355,6 +443,43 @@ def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir):
assert column[0]["sampling_rate"] == 16000


@pytest.mark.torchaudio_latest
@require_torchaudio_latest
@pytest.mark.parametrize("torchaudio_failed", [False, True])
def test_resampling_at_loading_dataset_with_audio_feature_mp3_torchaudio_latest(shared_datadir, torchaudio_failed):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
data = {"audio": [audio_path]}
features = Features({"audio": Audio(sampling_rate=16000)})
dset = Dataset.from_dict(data, features=features)

# if torchaudio>=0.12 failed, mp3 must be decoded anyway (with librosa)
with patch("torchaudio.load") if torchaudio_failed else nullcontext() as load_mock, pytest.warns(
UserWarning, match=r"Decoding mp3 with `librosa` instead of `torchaudio`.+?"
) if torchaudio_failed else nullcontext():
if torchaudio_failed:
load_mock.side_effect = RuntimeError()

item = dset[0]
assert item.keys() == {"audio"}
assert item["audio"].keys() == {"path", "array", "sampling_rate"}
assert item["audio"]["path"] == audio_path
assert item["audio"]["array"].shape == (40125,)
assert item["audio"]["sampling_rate"] == 16000
batch = dset[:1]
assert batch.keys() == {"audio"}
assert len(batch["audio"]) == 1
assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
assert batch["audio"][0]["path"] == audio_path
assert batch["audio"][0]["array"].shape == (40125,)
assert batch["audio"][0]["sampling_rate"] == 16000
column = dset["audio"]
assert len(column) == 1
assert column[0].keys() == {"path", "array", "sampling_rate"}
assert column[0]["path"] == audio_path
assert column[0]["array"].shape == (40125,)
assert column[0]["sampling_rate"] == 16000


@require_sndfile
def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
@@ -386,7 +511,7 @@ def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir):


@require_sox
@require_sndfile
@require_torchaudio
def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
data = {"audio": [audio_path]}
@@ -416,6 +541,46 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir)
assert column[0]["sampling_rate"] == 16000


@pytest.mark.torchaudio_latest
@require_torchaudio_latest
@pytest.mark.parametrize("torchaudio_failed", [False, True])
def test_resampling_after_loading_dataset_with_audio_feature_mp3_torchaudio_latest(shared_datadir, torchaudio_failed):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
data = {"audio": [audio_path]}
features = Features({"audio": Audio()})
dset = Dataset.from_dict(data, features=features)

# if torchaudio>=0.12 failed, mp3 must be decoded anyway (with librosa)
with patch("torchaudio.load") if torchaudio_failed else nullcontext() as load_mock, pytest.warns(
UserWarning, match=r"Decoding mp3 with `librosa` instead of `torchaudio`.+?"
) if torchaudio_failed else nullcontext():
if torchaudio_failed:
load_mock.side_effect = RuntimeError()

item = dset[0]
assert item["audio"]["sampling_rate"] == 44100
dset = dset.cast_column("audio", Audio(sampling_rate=16000))
item = dset[0]
assert item.keys() == {"audio"}
assert item["audio"].keys() == {"path", "array", "sampling_rate"}
assert item["audio"]["path"] == audio_path
assert item["audio"]["array"].shape == (40125,)
assert item["audio"]["sampling_rate"] == 16000
batch = dset[:1]
assert batch.keys() == {"audio"}
assert len(batch["audio"]) == 1
assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
assert batch["audio"][0]["path"] == audio_path
assert batch["audio"][0]["array"].shape == (40125,)
assert batch["audio"][0]["sampling_rate"] == 16000
column = dset["audio"]
assert len(column) == 1
assert column[0].keys() == {"path", "array", "sampling_rate"}
assert column[0]["path"] == audio_path
assert column[0]["array"].shape == (40125,)
assert column[0]["sampling_rate"] == 16000


@pytest.mark.parametrize(
"build_data",
[
11 changes: 10 additions & 1 deletion tests/utils.py
@@ -64,7 +64,16 @@ def parse_flag_from_env(key, default=False):
find_library("sox") is None,
reason="test requires sox OS dependency; only available on non-Windows: 'sudo apt-get install sox'",
)
require_torchaudio = pytest.mark.skipif(find_spec("torchaudio") is None, reason="test requires torchaudio")
require_torchaudio = pytest.mark.skipif(
find_spec("torchaudio") is None
or version.parse(import_module("torchaudio").__version__) >= version.parse("0.12.0"),
reason="test requires torchaudio<0.12",
)
require_torchaudio_latest = pytest.mark.skipif(
find_spec("torchaudio") is None
or version.parse(import_module("torchaudio").__version__) < version.parse("0.12.0"),
reason="test requires torchaudio>=0.12",
)


def require_beam(test_case):
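To reproduce the new CI stage locally (a sketch, assuming pytest and torchaudio>=0.12 with ffmpeg are installed), the tests behind the new marker can also be selected programmatically:

# Sketch (not part of the diff): run only the tests gated by the new
# torchaudio_latest marker, mirroring the dedicated CI step.
import pytest

if __name__ == "__main__":
    raise SystemExit(pytest.main(["-rfExX", "-m", "torchaudio_latest", "-sv", "tests/features/test_audio.py"]))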

1 comment on commit 142404f

@github-actions



PyArrow==6.0.0


Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.012844 / 0.011353 (0.001491) 0.005546 / 0.011008 (-0.005462) 0.039397 / 0.038508 (0.000889) 0.040313 / 0.023109 (0.017204) 0.412447 / 0.275898 (0.136549) 0.490843 / 0.323480 (0.167363) 0.007651 / 0.007986 (-0.000335) 0.006165 / 0.004328 (0.001837) 0.009349 / 0.004250 (0.005099) 0.058385 / 0.037052 (0.021332) 0.441486 / 0.258489 (0.182997) 0.491014 / 0.293841 (0.197173) 0.050618 / 0.128546 (-0.077928) 0.016079 / 0.075646 (-0.059567) 0.340902 / 0.419271 (-0.078370) 0.070680 / 0.043533 (0.027147) 0.428670 / 0.255139 (0.173531) 0.423359 / 0.283200 (0.140159) 0.119648 / 0.141683 (-0.022035) 1.902936 / 1.452155 (0.450782) 1.978545 / 1.492716 (0.485829)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.311158 / 0.018006 (0.293152) 0.596473 / 0.000490 (0.595984) 0.001188 / 0.000200 (0.000988) 0.000142 / 0.000054 (0.000088)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.030538 / 0.037411 (-0.006873) 0.126683 / 0.014526 (0.112157) 0.141563 / 0.176557 (-0.034994) 0.201562 / 0.737135 (-0.535574) 0.149388 / 0.296338 (-0.146951)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.664703 / 0.215209 (0.449494) 6.426338 / 2.077655 (4.348683) 2.689811 / 1.504120 (1.185691) 2.283971 / 1.541195 (0.742776) 2.297527 / 1.468490 (0.829037) 0.759271 / 4.584777 (-3.825506) 5.876841 / 3.745712 (2.131129) 5.303476 / 5.269862 (0.033614) 2.914138 / 4.565676 (-1.651539) 0.092394 / 0.424275 (-0.331881) 0.015364 / 0.007607 (0.007757) 0.797867 / 0.226044 (0.571822) 8.128731 / 2.268929 (5.859802) 3.375149 / 55.444624 (-52.069476) 2.680923 / 6.876477 (-4.195554) 2.953092 / 2.142072 (0.811019) 0.979433 / 4.805227 (-3.825794) 0.196643 / 6.500664 (-6.304021) 0.081538 / 0.075469 (0.006068)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 2.045620 / 1.841788 (0.203832) 18.421031 / 8.074308 (10.346723) 44.181219 / 10.191392 (33.989827) 1.235825 / 0.680424 (0.555401) 0.816255 / 0.534201 (0.282054) 0.531875 / 0.579283 (-0.047408) 0.668184 / 0.434364 (0.233820) 0.372898 / 0.540337 (-0.167440) 0.398466 / 1.386936 (-0.988470)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008653 / 0.011353 (-0.002700) 0.005603 / 0.011008 (-0.005405) 0.038452 / 0.038508 (-0.000057) 0.038416 / 0.023109 (0.015306) 0.519037 / 0.275898 (0.243139) 0.626340 / 0.323480 (0.302860) 0.005073 / 0.007986 (-0.002913) 0.005156 / 0.004328 (0.000827) 0.007505 / 0.004250 (0.003255) 0.047665 / 0.037052 (0.010613) 0.507704 / 0.258489 (0.249215) 0.588794 / 0.293841 (0.294953) 0.048910 / 0.128546 (-0.079637) 0.014631 / 0.075646 (-0.061015) 0.371259 / 0.419271 (-0.048013) 0.071651 / 0.043533 (0.028118) 0.534344 / 0.255139 (0.279205) 0.568312 / 0.283200 (0.285112) 0.121908 / 0.141683 (-0.019775) 1.956110 / 1.452155 (0.503955) 1.994512 / 1.492716 (0.501796)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.304382 / 0.018006 (0.286376) 0.605044 / 0.000490 (0.604554) 0.008333 / 0.000200 (0.008133) 0.000132 / 0.000054 (0.000078)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.031557 / 0.037411 (-0.005855) 0.143703 / 0.014526 (0.129177) 0.148345 / 0.176557 (-0.028212) 0.199374 / 0.737135 (-0.537761) 0.156375 / 0.296338 (-0.139964)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.634142 / 0.215209 (0.418933) 6.668779 / 2.077655 (4.591124) 2.916332 / 1.504120 (1.412212) 2.579499 / 1.541195 (1.038305) 2.570470 / 1.468490 (1.101980) 0.863389 / 4.584777 (-3.721388) 6.144042 / 3.745712 (2.398330) 3.274132 / 5.269862 (-1.995730) 2.048455 / 4.565676 (-2.517222) 0.096836 / 0.424275 (-0.327439) 0.014568 / 0.007607 (0.006961) 0.861857 / 0.226044 (0.635813) 8.402585 / 2.268929 (6.133656) 3.843762 / 55.444624 (-51.600863) 2.882419 / 6.876477 (-3.994058) 3.190213 / 2.142072 (1.048141) 1.042322 / 4.805227 (-3.762905) 0.224966 / 6.500664 (-6.275699) 0.084048 / 0.075469 (0.008579)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 2.141178 / 1.841788 (0.299391) 17.606715 / 8.074308 (9.532407) 43.768326 / 10.191392 (33.576934) 1.264420 / 0.680424 (0.583996) 0.858414 / 0.534201 (0.324213) 0.536950 / 0.579283 (-0.042333) 0.664361 / 0.434364 (0.229997) 0.385625 / 0.540337 (-0.154712) 0.402487 / 1.386936 (-0.984449)

