forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
/
__init__.py
51 lines (44 loc) 路 2.05 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import inspect
import re
from hashlib import sha256
from typing import List
from .audiofolder import audiofolder
from .csv import csv
from .imagefolder import imagefolder
from .json import json
from .pandas import pandas
from .parquet import parquet
from .text import text
def _hash_python_lines(lines: List[str]) -> str:
filtered_lines = []
for line in lines:
line = re.sub(r"#.*", "", line) # remove comments
if line:
filtered_lines.append(line)
full_str = "\n".join(filtered_lines)
# Make a hash from all this code
full_bytes = full_str.encode("utf-8")
return sha256(full_bytes).hexdigest()
# get importable module names and hash for caching
_PACKAGED_DATASETS_MODULES = {
"csv": (csv.__name__, _hash_python_lines(inspect.getsource(csv).splitlines())),
"json": (json.__name__, _hash_python_lines(inspect.getsource(json).splitlines())),
"pandas": (pandas.__name__, _hash_python_lines(inspect.getsource(pandas).splitlines())),
"parquet": (parquet.__name__, _hash_python_lines(inspect.getsource(parquet).splitlines())),
"text": (text.__name__, _hash_python_lines(inspect.getsource(text).splitlines())),
"imagefolder": (imagefolder.__name__, _hash_python_lines(inspect.getsource(imagefolder).splitlines())),
"audiofolder": (audiofolder.__name__, _hash_python_lines(inspect.getsource(audiofolder).splitlines())),
}
_EXTENSION_TO_MODULE = {
"csv": ("csv", {}),
"tsv": ("csv", {"sep": "\t"}),
"json": ("json", {}),
"jsonl": ("json", {}),
"parquet": ("parquet", {}),
"txt": ("text", {}),
}
_EXTENSION_TO_MODULE.update({ext[1:]: ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS})
_EXTENSION_TO_MODULE.update({ext[1:].upper(): ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS})
_EXTENSION_TO_MODULE.update({ext[1:]: ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS})
_EXTENSION_TO_MODULE.update({ext[1:].upper(): ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS})
_MODULE_SUPPORTS_METADATA = {"imagefolder", "audiofolder"}