Skip to content

Commit

Permalink
add thread_map to load_from_disk huggingface#2252
Browse files Browse the repository at this point in the history
  • Loading branch information
kkoutini committed Dec 6, 2023
1 parent d78f070 commit 555c8a3
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import pyarrow.compute as pc
from huggingface_hub import CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
from multiprocess import Pool
from tqdm.contrib.concurrent import thread_map

from . import config
from .arrow_reader import ArrowReader
Expand Down Expand Up @@ -1703,9 +1704,15 @@ def load_from_disk(
)
keep_in_memory = keep_in_memory if keep_in_memory is not None else is_small_dataset(dataset_size)
table_cls = InMemoryTable if keep_in_memory else MemoryMappedTable

arrow_table = concat_tables(
table_cls.from_file(posixpath.join(dest_dataset_path, data_file["filename"]))
for data_file in state["_data_files"]
thread_map(
table_cls.from_file,
[posixpath.join(dest_dataset_path, data_file["filename"]) for data_file in state["_data_files"]],
tqdm_class=hf_tqdm,
desc="Loading dataset from disk",
disable=len(state["_data_files"]) <= 16,
)
)

split = state["_split"]
Expand Down

0 comments on commit 555c8a3

Please sign in to comment.