From 7e401fb9952cad9af0528712384bd20de329991c Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Wed, 27 Jul 2022 08:21:58 -0400 Subject: [PATCH 1/2] Remove all uses of six --- .../research_projects/tapex/wikisql_utils.py | 4 +--- .../deberta_v2/tokenization_deberta_v2.py | 19 +++++----------- .../models/flaubert/tokenization_flaubert.py | 22 +++++++++++++------ 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/examples/research_projects/tapex/wikisql_utils.py b/examples/research_projects/tapex/wikisql_utils.py index 9147fdc882e4b..3028e81ad481f 100644 --- a/examples/research_projects/tapex/wikisql_utils.py +++ b/examples/research_projects/tapex/wikisql_utils.py @@ -23,8 +23,6 @@ # Original: https://github.com/google-research/tapas/master/wikisql_utils.py from typing import Any, List, Text -import six - EMPTY_ANSWER = "none" EMPTY_ANSWER_AGG = "none" @@ -49,7 +47,7 @@ def convert_to_float(value): return value if isinstance(value, int): return float(value) - if not isinstance(value, six.string_types): + if not isinstance(value, str): raise ValueError("Argument value is not a string. Can't parse it as float") sanitized = value diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index 123afacf822ca..9ac28c82cd614 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -19,7 +19,6 @@ from typing import Any, Dict, List, Optional, Tuple import sentencepiece as sp -import six from ...tokenization_utils import PreTrainedTokenizer @@ -523,17 +522,9 @@ def _is_punctuation(char): def convert_to_unicode(text): """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError(f"Unsupported string type: {type(text)}") - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - else: - raise ValueError(f"Unsupported string type: {type(text)}") + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") else: - raise ValueError("Not running on Python2 or Python 3?") + raise ValueError(f"Unsupported string type: {type(text)}") diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py index 4fbb3783d8a38..7fad11e20e74a 100644 --- a/src/transformers/models/flaubert/tokenization_flaubert.py +++ b/src/transformers/models/flaubert/tokenization_flaubert.py @@ -17,8 +17,6 @@ import unicodedata -import six - from ...utils import logging from ..xlm.tokenization_xlm import XLMTokenizer @@ -72,20 +70,30 @@ } +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError(f"Unsupported string type: {type(text)}") + + def convert_to_unicode(text): """ Converts `text` to Unicode (if it's not already), assuming UTF-8 input. """ - # six_ensure_text is copied from https://github.com/benjaminp/six - def six_ensure_text(s, encoding="utf-8", errors="strict"): - if isinstance(s, six.binary_type): + + def ensure_text(s, encoding="utf-8", errors="strict"): + if isinstance(s, bytes): return s.decode(encoding, errors) - elif isinstance(s, six.text_type): + elif isinstance(s, str): return s else: raise TypeError(f"not expecting type '{type(s)}'") - return six_ensure_text(text, encoding="utf-8", errors="ignore") + return ensure_text(text, encoding="utf-8", errors="ignore") class FlaubertTokenizer(XLMTokenizer): From ded2daf750b65e28b79202e1f0849d0f24bb8121 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Wed, 27 Jul 2022 08:27:38 -0400 Subject: [PATCH 2/2] fix quality --- .../models/flaubert/tokenization_flaubert.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py index 7fad11e20e74a..5d5ad2a657d1b 100644 --- a/src/transformers/models/flaubert/tokenization_flaubert.py +++ b/src/transformers/models/flaubert/tokenization_flaubert.py @@ -70,16 +70,6 @@ } -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError(f"Unsupported string type: {type(text)}") - - def convert_to_unicode(text): """ Converts `text` to Unicode (if it's not already), assuming UTF-8 input.