Refine Bf16 test for deepspeed #17734

Merged · 3 commits · Jun 16, 2022
2 changes: 2 additions & 0 deletions src/transformers/utils/__init__.py
@@ -125,6 +125,8 @@
     is_tokenizers_available,
     is_torch_available,
     is_torch_bf16_available,
+    is_torch_bf16_cpu_available,
+    is_torch_bf16_gpu_available,
     is_torch_cuda_available,
     is_torch_fx_available,
     is_torch_fx_proxy,
36 changes: 24 additions & 12 deletions src/transformers/utils/import_utils.py
@@ -272,7 +272,7 @@ def is_torch_cuda_available():
     return False


-def is_torch_bf16_available():
+def is_torch_bf16_gpu_available():
     if not is_torch_available():
         return False

@@ -288,30 +288,42 @@ def is_torch_bf16_available():
     # 4. torch.autocast exists
     # XXX: one problem here is that it may give invalid results on mixed gpus setup, so it's
     # really only correct for the 0th gpu (or currently set default device if different from 0)
-    is_torch_gpu_bf16_available = True
-    is_torch_cpu_bf16_available = True
     if version.parse(torch.__version__) < version.parse("1.10"):
-        is_torch_gpu_bf16_available = False
-        is_torch_cpu_bf16_available = False
+        return False

     if torch.cuda.is_available() and torch.version.cuda is not None:
         if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
-            is_torch_gpu_bf16_available = False
+            return False
         if int(torch.version.cuda.split(".")[0]) < 11:
-            is_torch_gpu_bf16_available = False
+            return False
         if not hasattr(torch.cuda.amp, "autocast"):
-            is_torch_gpu_bf16_available = False
+            return False
     else:
-        is_torch_gpu_bf16_available = False
+        return False
+
+    return True
+
+
+def is_torch_bf16_cpu_available():
+    if not is_torch_available():
+        return False
+
+    import torch
+
+    if version.parse(torch.__version__) < version.parse("1.10"):
+        return False

     # checking CPU
     try:
         # multiple levels of AttributeError depending on the pytorch version so do them all in one check
         _ = torch.cpu.amp.autocast
     except AttributeError:
-        is_torch_cpu_bf16_available = False
+        return False

-    return is_torch_cpu_bf16_available or is_torch_gpu_bf16_available
+    return True
+
+
+def is_torch_bf16_available():
+    return is_torch_bf16_cpu_available() or is_torch_bf16_gpu_available()
Comment on lines +325 to +326

@stas00 (Contributor) commented on Jun 16, 2022:
Now that you have split it up into 2 specific components, I think this one is ambiguous - what does it actually mean from the usage point of view?

Say a user has a cpu supporting bf16, but they are actually planning to use a gpu, which may not support bf16 - this will return True, and then their code will either fail or run really slowly.

I know that you haven't added this in this PR; the expansion to add cpu checks in the previous PR (which I missed) created this ambiguity in the first place.

What do you think?

The original is_torch_bf16_available was just doing gpu checks, so perhaps we deprecate it and alias it to is_torch_bf16_gpu_available.
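[Editor's note: a minimal sketch of the failure mode described above, for illustration only - it assumes a machine whose cpu supports bf16 but whose gpu is pre-Ampere, and use_bf16 is a made-up flag, not library API:]

from transformers.utils import is_torch_bf16_available

# the combined check returns True because the cpu check passes...
if is_torch_bf16_available():
    use_bf16 = True  # ...but training actually targets the gpu, which
                     # lacks bf16, so the run fails or falls back to a slow path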

Contributor:

import warnings

def is_torch_bf16_available():
    warnings.warn("is_torch_bf16_available is deprecated, use is_torch_bf16_gpu_available instead", FutureWarning)
    return is_torch_bf16_gpu_available()

Collaborator (Author):

It is used in the Trainer for the Intel CPU integration, so your suggestion would just break that new integration.

Contributor:

That's correct - we rename it in the Trainer, since it's incorrect as is: it should be doing a cpu check, as introduced in this PR.

Contributor:

Can do in the next PR.



 def is_torch_tf32_available():
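[Editor's note: for context, a short sketch of how calling code might use the two new checks after this PR. The can_use_bf16 helper is hypothetical, not part of the library:]

from transformers.utils import is_torch_bf16_cpu_available, is_torch_bf16_gpu_available

def can_use_bf16(device: str) -> bool:
    # hypothetical helper: pick the check matching the device the user
    # actually targets, avoiding the cpu/gpu ambiguity raised in the review
    if device.startswith("cuda"):
        return is_torch_bf16_gpu_available()
    return is_torch_bf16_cpu_available()

print(can_use_bf16("cuda"), can_use_bf16("cpu"))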
6 changes: 3 additions & 3 deletions tests/deepspeed/test_deepspeed.py
@@ -42,7 +42,7 @@
     slow,
 )
 from transformers.trainer_utils import get_last_checkpoint, set_seed
-from transformers.utils import WEIGHTS_NAME, is_torch_bf16_available
+from transformers.utils import WEIGHTS_NAME, is_torch_bf16_gpu_available


 if is_torch_available():
@@ -129,7 +129,7 @@ def get_launcher(distributed=False):
 BF16 = "bf16"

 stages = [ZERO2, ZERO3]
-if is_torch_bf16_available():
+if is_torch_bf16_gpu_available():
     dtypes = [FP16, BF16]
 else:
     dtypes = [FP16]
@@ -920,7 +920,7 @@ def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
     @require_torch_multi_gpu
     @parameterized.expand(["bf16", "fp16", "fp32"])
     def test_inference(self, dtype):
-        if dtype == "bf16" and not is_torch_bf16_available():
+        if dtype == "bf16" and not is_torch_bf16_gpu_available():
             self.skipTest("test requires bfloat16 hardware support")

         # this is just inference, so no optimizer should be loaded
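[Editor's note: for reference, a rough sketch of how the stages/dtypes lists above would typically expand into the parameterized test matrix, assuming the usual itertools.product pattern; the stage and dtype string values are assumed for illustration and are not shown in this diff:]

import itertools

ZERO2, ZERO3 = "zero2", "zero3"  # stage labels assumed for illustration
FP16, BF16 = "fp16", "bf16"

stages = [ZERO2, ZERO3]
dtypes = [FP16, BF16]  # with a bf16-capable gpu; otherwise just [FP16]

params = list(itertools.product(stages, dtypes))
# -> [('zero2', 'fp16'), ('zero2', 'bf16'), ('zero3', 'fp16'), ('zero3', 'bf16')]
# on hardware without gpu bf16 support, only the fp16 variants remain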