From 59ce18bd616bbaac378dd789659d131b47664746 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Wed, 13 Jul 2022 17:50:26 +0200 Subject: [PATCH 1/8] NLLB tokenizer --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 1 + docs/source/en/model_doc/nllb.mdx | 23 + src/transformers/__init__.py | 5 + src/transformers/convert_slow_tokenizer.py | 32 ++ src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 1 + .../models/auto/tokenization_auto.py | 7 + src/transformers/models/nllb/__init__.py | 68 +++ .../models/nllb/tokenization_nllb.py | 361 ++++++++++++++ .../models/nllb/tokenization_nllb_fast.py | 298 ++++++++++++ .../utils/dummy_sentencepiece_objects.py | 7 + .../utils/dummy_tokenizers_objects.py | 7 + tests/models/nllb/__init__.py | 0 tests/models/nllb/test_tokenization_nllb.py | 442 ++++++++++++++++++ 18 files changed, 1257 insertions(+) create mode 100644 docs/source/en/model_doc/nllb.mdx create mode 100644 src/transformers/models/nllb/__init__.py create mode 100644 src/transformers/models/nllb/tokenization_nllb.py create mode 100644 src/transformers/models/nllb/tokenization_nllb_fast.py create mode 100644 tests/models/nllb/__init__.py create mode 100644 tests/models/nllb/test_tokenization_nllb.py diff --git a/README.md b/README.md index ea86f0ffe2c96..7cb10a0da9308 100644 --- a/README.md +++ b/README.md @@ -304,6 +304,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from ) released with the paper []() by . 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. 
**[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/README_ko.md b/README_ko.md index 2ea16ec787e4b..1fd4bda835249 100644 --- a/README_ko.md +++ b/README_ko.md @@ -285,6 +285,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from ) released with the paper []() by . 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/README_zh-hans.md b/README_zh-hans.md index 67df416851d9b..2755dcfbd7887 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -309,6 +309,7 @@ conda install -c huggingface transformers 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。 1. 
**[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。 +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from ) released with the paper []() by . 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 8e220df645ec9..8d5c425a095d9 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -321,6 +321,7 @@ conda install -c huggingface transformers 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from ) released with the paper []() by . 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. 
**[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 1e8184facdead..e32e3e9b1be0a 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -127,6 +127,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](model_doc/nllb)** (from ) released with the paper []() by . 1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 
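For orientation before the new doc page below: a minimal usage sketch of the API this patch adds. It mirrors the docstring example that appears later in the diff and assumes the `facebook/nllb-200-distilled-600M` checkpoint referenced throughout, plus the `sentencepiece` backend for the slow tokenizer.

```python
# Minimal sketch: tokenize a source sentence and its translation for training.
# Assumes the facebook/nllb-200-distilled-600M checkpoint is available and that
# sentencepiece is installed (NllbTokenizer is the slow, sentencepiece-backed class).
from transformers import NllbTokenizer

tokenizer = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
)

inputs = tokenizer("UN Chief Says There Is No Military Solution in Syria", return_tensors="pt")
with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie.", return_tensors="pt"
    )
inputs["labels"] = labels["input_ids"]
```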
diff --git a/docs/source/en/model_doc/nllb.mdx b/docs/source/en/model_doc/nllb.mdx new file mode 100644 index 0000000000000..81a0e30ee6627 --- /dev/null +++ b/docs/source/en/model_doc/nllb.mdx @@ -0,0 +1,23 @@ + + +# NLLB + +## NllbTokenizer + +[[autodoc]] NllbTokenizer + - as_target_tokenizer + - build_inputs_with_special_tokens + +## NllbTokenizerFast + +[[autodoc]] NllbTokenizerFast diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6806de0af032b..d410b6990c36e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -266,6 +266,7 @@ "models.mt5": ["MT5Config"], "models.mvp": ["MvpConfig", "MvpTokenizer"], "models.nezha": ["NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP", "NezhaConfig"], + "models.nllb": [], "models.nystromformer": [ "NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "NystromformerConfig", @@ -479,6 +480,7 @@ _import_structure["models.m2m_100"].append("M2M100Tokenizer") _import_structure["models.marian"].append("MarianTokenizer") _import_structure["models.mbart"].append("MBartTokenizer") + _import_structure["models.nllb"].append("NllbTokenizer") _import_structure["models.mbart50"].append("MBart50Tokenizer") _import_structure["models.mluke"].append("MLukeTokenizer") _import_structure["models.mt5"].append("MT5Tokenizer") @@ -543,6 +545,7 @@ _import_structure["models.mpnet"].append("MPNetTokenizerFast") _import_structure["models.mt5"].append("MT5TokenizerFast") _import_structure["models.mvp"].append("MvpTokenizerFast") + _import_structure["models.nllb"].append("NllbTokenizerFast") _import_structure["models.openai"].append("OpenAIGPTTokenizerFast") _import_structure["models.pegasus"].append("PegasusTokenizerFast") _import_structure["models.realm"].append("RealmTokenizerFast") @@ -3178,6 +3181,7 @@ from .models.mbart import MBart50Tokenizer, MBartTokenizer from .models.mluke import MLukeTokenizer from .models.mt5 import MT5Tokenizer + from .models.nllb import NllbTokenizer from .models.pegasus import PegasusTokenizer from .models.plbart import PLBartTokenizer from .models.reformer import ReformerTokenizer @@ -3232,6 +3236,7 @@ from .models.mpnet import MPNetTokenizerFast from .models.mt5 import MT5TokenizerFast from .models.mvp import MvpTokenizerFast + from .models.nllb import NllbTokenizerFast from .models.openai import OpenAIGPTTokenizerFast from .models.pegasus import PegasusTokenizerFast from .models.realm import RealmTokenizerFast diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 90598547999e6..427ce3516591c 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -694,6 +694,37 @@ def post_processor(self): ) +class NllbConverter(SpmConverter): + def vocab(self, proto): + vocab = [ + ("", 0.0), + ("", 0.0), + ("", 0.0), + ("", 0.0), + ] + vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] + vocab += [ + # fmt: off + ('ace_Arab', 0.0), ('ace_Latn', 0.0), ('acm_Arab', 0.0), ('acq_Arab', 0.0), ('aeb_Arab', 0.0), ('afr_Latn', 0.0), ('ajp_Arab', 0.0), ('aka_Latn', 0.0), ('amh_Ethi', 0.0), ('apc_Arab', 0.0), ('arb_Arab', 0.0), ('ars_Arab', 0.0), ('ary_Arab', 0.0), ('arz_Arab', 0.0), ('asm_Beng', 0.0), ('ast_Latn', 0.0), ('awa_Deva', 0.0), ('ayr_Latn', 0.0), ('azb_Arab', 0.0), ('azj_Latn', 0.0), ('bak_Cyrl', 0.0), ('bam_Latn', 0.0), ('ban_Latn', 0.0), ('bel_Cyrl', 0.0), ('bem_Latn', 0.0), ('ben_Beng', 0.0), ('bho_Deva', 0.0), ('bjn_Arab', 0.0), ('bjn_Latn', 0.0), ('bod_Tibt', 0.0), ('bos_Latn', 0.0), ('bug_Latn', 
0.0), ('bul_Cyrl', 0.0), ('cat_Latn', 0.0), ('ceb_Latn', 0.0), ('ces_Latn', 0.0), ('cjk_Latn', 0.0), ('ckb_Arab', 0.0), ('crh_Latn', 0.0), ('cym_Latn', 0.0), ('dan_Latn', 0.0), ('deu_Latn', 0.0), ('dik_Latn', 0.0), ('dyu_Latn', 0.0), ('dzo_Tibt', 0.0), ('ell_Grek', 0.0), ('eng_Latn', 0.0), ('epo_Latn', 0.0), ('est_Latn', 0.0), ('eus_Latn', 0.0), ('ewe_Latn', 0.0), ('fao_Latn', 0.0), ('pes_Arab', 0.0), ('fij_Latn', 0.0), ('fin_Latn', 0.0), ('fon_Latn', 0.0), ('fra_Latn', 0.0), ('fur_Latn', 0.0), ('fuv_Latn', 0.0), ('gla_Latn', 0.0), ('gle_Latn', 0.0), ('glg_Latn', 0.0), ('grn_Latn', 0.0), ('guj_Gujr', 0.0), ('hat_Latn', 0.0), ('hau_Latn', 0.0), ('heb_Hebr', 0.0), ('hin_Deva', 0.0), ('hne_Deva', 0.0), ('hrv_Latn', 0.0), ('hun_Latn', 0.0), ('hye_Armn', 0.0), ('ibo_Latn', 0.0), ('ilo_Latn', 0.0), ('ind_Latn', 0.0), ('isl_Latn', 0.0), ('ita_Latn', 0.0), ('jav_Latn', 0.0), ('jpn_Jpan', 0.0), ('kab_Latn', 0.0), ('kac_Latn', 0.0), ('kam_Latn', 0.0), ('kan_Knda', 0.0), ('kas_Arab', 0.0), ('kas_Deva', 0.0), ('kat_Geor', 0.0), ('knc_Arab', 0.0), ('knc_Latn', 0.0), ('kaz_Cyrl', 0.0), ('kbp_Latn', 0.0), ('kea_Latn', 0.0), ('khm_Khmr', 0.0), ('kik_Latn', 0.0), ('kin_Latn', 0.0), ('kir_Cyrl', 0.0), ('kmb_Latn', 0.0), ('kon_Latn', 0.0), ('kor_Hang', 0.0), ('kmr_Latn', 0.0), ('lao_Laoo', 0.0), ('lvs_Latn', 0.0), ('lij_Latn', 0.0), ('lim_Latn', 0.0), ('lin_Latn', 0.0), ('lit_Latn', 0.0), ('lmo_Latn', 0.0), ('ltg_Latn', 0.0), ('ltz_Latn', 0.0), ('lua_Latn', 0.0), ('lug_Latn', 0.0), ('luo_Latn', 0.0), ('lus_Latn', 0.0), ('mag_Deva', 0.0), ('mai_Deva', 0.0), ('mal_Mlym', 0.0), ('mar_Deva', 0.0), ('min_Latn', 0.0), ('mkd_Cyrl', 0.0), ('plt_Latn', 0.0), ('mlt_Latn', 0.0), ('mni_Beng', 0.0), ('khk_Cyrl', 0.0), ('mos_Latn', 0.0), ('mri_Latn', 0.0), ('zsm_Latn', 0.0), ('mya_Mymr', 0.0), ('nld_Latn', 0.0), ('nno_Latn', 0.0), ('nob_Latn', 0.0), ('npi_Deva', 0.0), ('nso_Latn', 0.0), ('nus_Latn', 0.0), ('nya_Latn', 0.0), ('oci_Latn', 0.0), ('gaz_Latn', 0.0), ('ory_Orya', 0.0), ('pag_Latn', 0.0), ('pan_Guru', 0.0), ('pap_Latn', 0.0), ('pol_Latn', 0.0), ('por_Latn', 0.0), ('prs_Arab', 0.0), ('pbt_Arab', 0.0), ('quy_Latn', 0.0), ('ron_Latn', 0.0), ('run_Latn', 0.0), ('rus_Cyrl', 0.0), ('sag_Latn', 0.0), ('san_Deva', 0.0), ('sat_Beng', 0.0), ('scn_Latn', 0.0), ('shn_Mymr', 0.0), ('sin_Sinh', 0.0), ('slk_Latn', 0.0), ('slv_Latn', 0.0), ('smo_Latn', 0.0), ('sna_Latn', 0.0), ('snd_Arab', 0.0), ('som_Latn', 0.0), ('sot_Latn', 0.0), ('spa_Latn', 0.0), ('als_Latn', 0.0), ('srd_Latn', 0.0), ('srp_Cyrl', 0.0), ('ssw_Latn', 0.0), ('sun_Latn', 0.0), ('swe_Latn', 0.0), ('swh_Latn', 0.0), ('szl_Latn', 0.0), ('tam_Taml', 0.0), ('tat_Cyrl', 0.0), ('tel_Telu', 0.0), ('tgk_Cyrl', 0.0), ('tgl_Latn', 0.0), ('tha_Thai', 0.0), ('tir_Ethi', 0.0), ('taq_Latn', 0.0), ('taq_Tfng', 0.0), ('tpi_Latn', 0.0), ('tsn_Latn', 0.0), ('tso_Latn', 0.0), ('tuk_Latn', 0.0), ('tum_Latn', 0.0), ('tur_Latn', 0.0), ('twi_Latn', 0.0), ('tzm_Tfng', 0.0), ('uig_Arab', 0.0), ('ukr_Cyrl', 0.0), ('umb_Latn', 0.0), ('urd_Arab', 0.0), ('uzn_Latn', 0.0), ('vec_Latn', 0.0), ('vie_Latn', 0.0), ('war_Latn', 0.0), ('wol_Latn', 0.0), ('xho_Latn', 0.0), ('ydd_Hebr', 0.0), ('yor_Latn', 0.0), ('yue_Hant', 0.0), ('zho_Hans', 0.0), ('zho_Hant', 0.0), ('zul_Latn', 0.0) + # fmt: on + ] + vocab += [("", 0.0)] + return vocab + + def unk_id(self, proto): + return 3 + + def post_processor(self): + return processors.TemplateProcessing( + single="eng_Latn $A ", + pair="eng_Latn $A $B ", + special_tokens=[ + ("eng_Latn", self.original_tokenizer.convert_tokens_to_ids("eng_Latn")), + ("", 
self.original_tokenizer.convert_tokens_to_ids("")), + ], + ) + + class XLMRobertaConverter(SpmConverter): def vocab(self, proto): vocab = [ @@ -1034,6 +1065,7 @@ def post_processor(self): "MPNetTokenizer": MPNetConverter, "MobileBertTokenizer": BertConverter, "MvpTokenizer": RobertaConverter, + "NllbTokenizer": NllbConverter, "OpenAIGPTTokenizer": OpenAIGPTConverter, "PegasusTokenizer": PegasusConverter, "RealmTokenizer": BertConverter, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index c4b48e6cec658..b6fe28a33006e 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -97,6 +97,7 @@ mt5, mvp, nezha, + nllb, nystromformer, openai, opt, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 4e32b510b0c1e..51e8c15c11a1c 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -342,6 +342,7 @@ ("mt5", "MT5"), ("mvp", "MVP"), ("nezha", "Nezha"), + ("nllb", "NLLB"), ("nystromformer", "Nyströmformer"), ("openai-gpt", "OpenAI GPT"), ("opt", "OPT"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index d2f3477616958..c614d7e5c8f19 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -177,6 +177,13 @@ ), ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)), ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ( + "nllb", + ( + "NllbTokenizer" if is_sentencepiece_available() else None, + "NllbTokenizerFast" if is_tokenizers_available() else None, + ), + ), ( "nystromformer", ( diff --git a/src/transformers/models/nllb/__init__.py b/src/transformers/models/nllb/__init__.py new file mode 100644 index 0000000000000..a678bf527440f --- /dev/null +++ b/src/transformers/models/nllb/__init__.py @@ -0,0 +1,68 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
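With `NllbTokenizer` registered in `SLOW_TO_FAST_CONVERTERS` and in `tokenization_auto.py` above, the slow tokenizer can be converted to a `tokenizers`-backed object and `AutoTokenizer` can resolve NLLB checkpoints. A hedged sketch of what this enables; the checkpoint name comes from elsewhere in the patch, and `AutoTokenizer` resolution also assumes the checkpoint declares `NllbTokenizer` as its tokenizer class.

```python
# Sketch of what the registrations above enable. convert_slow_tokenizer() looks up
# NllbConverter via SLOW_TO_FAST_CONVERTERS["NllbTokenizer"] and returns a
# tokenizers.Tokenizer; AutoTokenizer picks the slow or fast class depending on
# which backends (sentencepiece / tokenizers) are installed.
from transformers import AutoTokenizer, NllbTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

slow = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
fast_backend = convert_slow_tokenizer(slow)  # tokenizers.Tokenizer built by NllbConverter

auto = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
print(type(auto).__name__)  # expected: NllbTokenizerFast when `tokenizers` is available
```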
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_sentencepiece_available, + is_tokenizers_available, + is_torch_available, +) + + +_import_structure = {} + +try: + if not is_sentencepiece_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_nllb"] = ["NllbTokenizer"] + +try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_nllb_fast"] = ["NllbTokenizerFast"] + + +if TYPE_CHECKING: + try: + if not is_sentencepiece_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_nllb import NllbTokenizer + + try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_nllb_fast import NllbTokenizerFast + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py new file mode 100644 index 0000000000000..1766bb1cf9e00 --- /dev/null +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -0,0 +1,361 @@ +# coding=utf-8 +# Copyright 2022 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
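The `__init__.py` above gates each class on its optional backend; together with the dummy objects added later in this patch, top-level imports of `transformers` always succeed, while using a class without its backend raises an explicit error. A small sketch of the resulting import behaviour, using only names defined in this patch:

```python
# The lazy-module pattern above only exposes the real classes when their backends
# are present; otherwise the dummy objects (added further down in this patch) are
# exported instead and raise a helpful error when instantiated.
from transformers.utils import is_sentencepiece_available, is_tokenizers_available

if is_sentencepiece_available():
    from transformers import NllbTokenizer  # sentencepiece-backed slow tokenizer
if is_tokenizers_available():
    from transformers import NllbTokenizerFast  # tokenizers-backed fast tokenizer
```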
+ +import os +from contextlib import contextmanager +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/nllb-200-distilled-600M": ( + "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/sentencepiece.bpe.model" + ), + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/nllb-200-distilled-600M": 1024, +} + +# fmt: off +FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn'] +# fmt: on + + +class NllbTokenizer(PreTrainedTokenizer): + """ + Construct an NLLB tokenizer. + + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + The tokenization method is ` ` for source language documents, and `` + ``` for target language documents. 
+
+    Examples:
+
+    ```python
+    >>> from transformers import NllbTokenizer
+
+    >>> tokenizer = NllbTokenizer.from_pretrained(
+    ...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
+    ... )
+    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
+    >>> with tokenizer.as_target_tokenizer():
+    ...     labels = tokenizer(expected_translation_french, return_tensors="pt")
+    >>> inputs["labels"] = labels["input_ids"]
+    ```"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: List[int] = []
+    suffix_tokens: List[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        tokenizer_file=None,
+        src_lang=None,
+        tgt_lang=None,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        additional_special_tokens=None,
+        **kwargs
+    ):
+
+        # Mask token behaves like a normal word, i.e. includes the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            tokenizer_file=None,
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            additional_special_tokens=additional_special_tokens,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
+        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
+        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+
+        # Mimic fairseq token-to-id alignment for the first 4 tokens
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
+
+        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+
+        self.sp_model_size = len(self.sp_model)
+        self.lang_code_to_id = {
+            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
+        }
+        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
+
+        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        self._additional_special_tokens = list(self.lang_code_to_id.keys())
+
+        if additional_special_tokens is not None:
+            # Only add those special tokens if they are not already there.
+ self._additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in self._additional_special_tokens] + ) + + self._src_lang = src_lang if src_lang is not None else "eng_Latn" + self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] + self.tgt_lang = tgt_lang + self.set_src_lang_special_tokens(self._src_lang) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + @property + def vocab_size(self): + return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 # Plus 1 for the mask token + + @property + def src_lang(self) -> str: + return self._src_lang + + @src_lang.setter + def src_lang(self, new_src_lang: str) -> None: + self._src_lang = new_src_lang + self.set_src_lang_special_tokens(self._src_lang) + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence: + + - `input_ids` (for encoder) `X [eos, src_lang_code]` + - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def _build_translation_inputs( + self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs + ): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def prepare_seq2seq_batch( + self, + src_texts: List[str], + src_lang: str = "eng_Latn", + tgt_texts: Optional[List[str]] = None, + tgt_lang: 
str = "fra_Latn", + **kwargs, + ) -> BatchEncoding: + self.src_lang = src_lang + self.tgt_lang = tgt_lang + return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) + + @contextmanager + def as_target_tokenizer(self): + """ + Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to + sequence-to-sequence models that need a slightly different processing for the labels. + """ + self.set_tgt_lang_special_tokens(self.tgt_lang) + yield + self.set_src_lang_special_tokens(self.src_lang) + + def set_src_lang_special_tokens(self, src_lang) -> None: + """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" + self.cur_lang_code = self.lang_code_to_id[src_lang] + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] + + def set_tgt_lang_special_tokens(self, lang: str) -> None: + """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].""" + self.cur_lang_code = self.lang_code_to_id[lang] + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py new file mode 100644 index 0000000000000..a71b89bb24f6c --- /dev/null +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -0,0 +1,298 @@ +# coding=utf-8 +# Copyright 2022 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from contextlib import contextmanager +from shutil import copyfile +from typing import List, Optional, Tuple + +from tokenizers import processors + +from ...tokenization_utils import AddedToken, BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_nllb import NllbTokenizer +else: + NllbTokenizer = None + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/nllb-200-distilled-600M": ( + "https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/sentencepiece.bpe.model" + ), + }, + "tokenizer_file": { + "facebook/nllb-200-distilled-600M": ( + "https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/tokenizer.json" + ), + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/nllb-large-en-ro": 1024, + "facebook/nllb-200-distilled-600M": 1024, +} + +# fmt: off +FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 
'zul_Latn']
+# fmt: on
+
+
+class NllbTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
+    <tokens> <eos>` for target language documents.
+
+    Examples:
+
+    ```python
+    >>> from transformers import NllbTokenizerFast
+
+    >>> tokenizer = NllbTokenizerFast.from_pretrained(
+    ...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
+    ... )
+    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
+    >>> with tokenizer.as_target_tokenizer():
+    ...     labels = tokenizer(expected_translation_french, return_tensors="pt")
+    >>> inputs["labels"] = labels["input_ids"]
+    ```"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = NllbTokenizer
+
+    prefix_tokens: List[int] = []
+    suffix_tokens: List[int] = []
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        src_lang=None,
+        tgt_lang=None,
+        additional_special_tokens=None,
+        **kwargs
+    ):
+        # Mask token behaves like a normal word, i.e. includes the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        super().__init__(
+            vocab_file=vocab_file,
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+        _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
+
+        if additional_special_tokens is not None:
+            # Only add those special tokens if they are not already there.
+ _additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in _additional_special_tokens] + ) + + self.add_special_tokens({"additional_special_tokens": _additional_special_tokens}) + self.lang_code_to_id = { + lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES + } + + self._src_lang = src_lang if src_lang is not None else "eng_Latn" + self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang) + self.tgt_lang = tgt_lang + self.set_src_lang_special_tokens(self._src_lang) + + @property + def src_lang(self) -> str: + return self._src_lang + + @src_lang.setter + def src_lang(self, new_src_lang: str) -> None: + self._src_lang = new_src_lang + self.set_src_lang_special_tokens(self._src_lang) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. The special tokens depend on calling set_lang. + + An NLLB sequence has the following format, where `X` represents the sequence: + + - `input_ids` (for encoder) `X [eos, src_lang_code]` + - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def _build_translation_inputs( + self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs + ): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs + + def prepare_seq2seq_batch( + self, + src_texts: List[str], + src_lang: str = "eng_Latn", + tgt_texts: Optional[List[str]] = None, + tgt_lang: str = "fra_Latn", + **kwargs, + ) -> BatchEncoding: + self.src_lang = src_lang + self.tgt_lang = tgt_lang + return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) + + @contextmanager + def as_target_tokenizer(self): + """ + Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to + sequence-to-sequence models that need a slightly different processing for the labels. + """ + self.set_tgt_lang_special_tokens(self.tgt_lang) + yield + self.set_src_lang_special_tokens(self.src_lang) + + def set_src_lang_special_tokens(self, src_lang) -> None: + """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" + self.cur_lang_code = self.convert_tokens_to_ids(src_lang) + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) + + def set_tgt_lang_special_tokens(self, lang: str) -> None: + """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].""" + self.cur_lang_code = self.convert_tokens_to_ids(lang) + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." 
+ ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory.") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py index 00989dc0d12a4..69f0bdcb7b1aa 100644 --- a/src/transformers/utils/dummy_sentencepiece_objects.py +++ b/src/transformers/utils/dummy_sentencepiece_objects.py @@ -115,6 +115,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["sentencepiece"]) +class NllbTokenizer(metaclass=DummyObject): + _backends = ["sentencepiece"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["sentencepiece"]) + + class PegasusTokenizer(metaclass=DummyObject): _backends = ["sentencepiece"] diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index 80f6e9c42e8eb..755be5c48ae52 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -269,6 +269,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tokenizers"]) +class NllbTokenizerFast(metaclass=DummyObject): + _backends = ["tokenizers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tokenizers"]) + + class OpenAIGPTTokenizerFast(metaclass=DummyObject): _backends = ["tokenizers"] diff --git a/tests/models/nllb/__init__.py b/tests/models/nllb/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py new file mode 100644 index 0000000000000..53123c17b3b6a --- /dev/null +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -0,0 +1,442 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import shutil +import tempfile +import unittest + +from transformers import ( + SPIECE_UNDERLINE, + AddedToken, + BatchEncoding, + NllbTokenizer, + NllbTokenizerFast, + is_torch_available, +) +from transformers.testing_utils import ( + get_tests_dir, + nested_simplify, + require_sentencepiece, + require_tokenizers, + require_torch, +) + +from ...test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +if is_torch_available(): + from transformers.models.m2m_100.modeling_m2m_100 import shift_tokens_right + +EN_CODE = 256047 +RO_CODE = 256145 + + +@require_sentencepiece +@require_tokenizers +class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = NllbTokenizer + rust_tokenizer_class = NllbTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + from_pretrained_kwargs = {"use_auth_token": True} + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def test_full_tokenizer(self): + tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [ + value + tokenizer.fairseq_offset + for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] + # ^ unk: 2 + 1 = 3 unk: 2 + 1 = 3 ^ + ], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) + + # overwrite from test_tokenization_common to speed up test + def test_save_pretrained(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + self.tokenizers_list[0] = ( + self.rust_tokenizer_class, + "hf-internal-testing/tiny-random-nllb", + {"use_auth_token": True}, + ) + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + the tokenizer.json file for the fast 
one + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) + # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=True + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=False + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it saved the tokenizer.json file + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + @require_torch + def test_prepare_seq2seq_batch(self): + if not self.test_seq2seq: + return + + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Longer text that will definitely require truncation. 
+ src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for" + " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons" + " will only worsen the violence and misery for millions of people.", + ] + tgt_text = [ + "Şeful ONU declară că nu există o soluţie militară în Siria", + "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al" + ' Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi' + " că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", + ] + try: + batch = tokenizer.prepare_seq2seq_batch( + src_texts=src_text, + tgt_texts=tgt_text, + max_length=3, + max_target_length=10, + return_tensors="pt", + src_lang="eng_Latn", + tgt_lang="ron_Latn", + ) + except NotImplementedError: + return + self.assertEqual(batch.input_ids.shape[1], 3) + self.assertEqual(batch.labels.shape[1], 10) + # max_target_length will default to max_length if not specified + batch = tokenizer.prepare_seq2seq_batch( + src_text, tgt_texts=tgt_text, max_length=3, return_tensors="pt" + ) + self.assertEqual(batch.input_ids.shape[1], 3) + self.assertEqual(batch.labels.shape[1], 3) + + batch_encoder_only = tokenizer.prepare_seq2seq_batch( + src_texts=src_text, max_length=3, max_target_length=10, return_tensors="pt" + ) + self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) + self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) + self.assertNotIn("decoder_input_ids", batch_encoder_only) + + @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + def test_save_slow_from_fast_and_reload_fast(self): + pass + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + r_output = tokenizer_r.encode("Hey this is a token") + + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + pretrained_name, + additional_special_tokens=added_tokens, + **kwargs, # , from_slow=True <- unfortunately too slow to convert + ) + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + + cr_output = tokenizer_cr.encode("Hey this is a token") + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class NllbDistilledIntegrationTest(unittest.TestCase): + checkpoint_name = "facebook/nllb-200-distilled-600M" + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", + ] + tgt_text = [ + 
"Şeful ONU declară că nu există o soluţie militară în Siria", + "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei" + ' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor' + " face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", + ] + expected_src_tokens = [ + 16297, + 134408, + 8165, + 248066, + 14734, + 950, + 1135, + 105721, + 3573, + 83, + 27352, + 108, + 49486, + 2, + 256047, + ] + + @classmethod + def setUpClass(cls): + cls.tokenizer: NllbTokenizer = NllbTokenizer.from_pretrained( + cls.checkpoint_name, src_lang="eng_Latn", tgt_lang="ron_Latn", use_auth_token=True + ) + cls.pad_token_id = 1 + return cls + + def test_language_codes(self): + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Arab"], 256001) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Latn"], 256002) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["fra_Latn"], 256057) + + def test_enro_tokenizer_batch_encode_plus(self): + ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0] + self.assertListEqual(self.expected_src_tokens, ids) + + def test_enro_tokenizer_decode_ignores_language_codes(self): + self.assertIn(RO_CODE, self.tokenizer.all_special_ids) + # fmt: off + generated_ids = [RO_CODE, 4254, 98068, 112923, 39072, 3909, 713, 102767, 26, 17314, 35642, 14683, 33118, 2022, 66987, 2, 256047] + # fmt: on + + result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + expected_romanian = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) + self.assertEqual(result, expected_romanian) + self.assertNotIn(self.tokenizer.eos_token, result) + + def test_enro_tokenizer_truncation(self): + src_text = ["this is gunna be a long sentence " * 20] + assert isinstance(src_text[0], str) + desired_max_length = 10 + ids = self.tokenizer(src_text, max_length=desired_max_length, truncation=True).input_ids[0] + self.assertEqual(ids[-2], 2) + self.assertEqual(ids[-1], EN_CODE) + self.assertEqual(len(ids), desired_max_length) + + def test_mask_token(self): + self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["", "ar_AR"]), [256203, 3]) + + def test_special_tokens_unaffacted_by_save_load(self): + tmpdirname = tempfile.mkdtemp() + original_special_tokens = self.tokenizer.fairseq_tokens_to_ids + self.tokenizer.save_pretrained(tmpdirname) + new_tok = NllbTokenizer.from_pretrained(tmpdirname) + self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens) + + @require_torch + def test_enro_tokenizer_prepare_batch(self): + batch = self.tokenizer( + self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt" + ) + with self.tokenizer.as_target_tokenizer(): + targets = self.tokenizer( + self.tgt_text, + padding=True, + truncation=True, + max_length=len(self.expected_src_tokens), + return_tensors="pt", + ) + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right( + labels, self.tokenizer.pad_token_id, self.tokenizer.lang_code_to_id["ron_Latn"] + ) + + self.assertIsInstance(batch, BatchEncoding) + + self.assertEqual((2, 15), batch.input_ids.shape) + self.assertEqual((2, 15), batch.attention_mask.shape) + result = batch.input_ids.tolist()[0] + self.assertListEqual(self.expected_src_tokens, result) + self.assertEqual(2, batch.decoder_input_ids[0, -1]) # EOS + # Test that special tokens are reset + self.assertEqual(self.tokenizer.prefix_tokens, []) + 
self.assertEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id, EN_CODE]) + + def test_seq2seq_max_length(self): + batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt") + with self.tokenizer.as_target_tokenizer(): + targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt") + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right( + labels, + self.tokenizer.pad_token_id, + decoder_start_token_id=self.tokenizer.lang_code_to_id[self.tokenizer.tgt_lang], + ) + + self.assertEqual(batch.input_ids.shape[1], 3) + self.assertEqual(batch.decoder_input_ids.shape[1], 10) + + @require_torch + def test_tokenizer_translation(self): + inputs = self.tokenizer._build_translation_inputs( + "A test", return_tensors="pt", src_lang="eng_Latn", tgt_lang="fra_Latn" + ) + + self.assertEqual( + nested_simplify(inputs), + { + # A, test, EOS, en_XX + "input_ids": [[70, 7356, 2, 256047]], + "attention_mask": [[1, 1, 1, 1]], + # ar_AR + "forced_bos_token_id": 256057, + }, + ) From 0d85ea4030ce1b594c11c150d580a88063ad93c9 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Thu, 14 Jul 2022 02:07:22 -0400 Subject: [PATCH 2/8] Apply suggestions from code review - Thanks Stefan! Co-authored-by: Stefan Schweter --- src/transformers/models/nllb/tokenization_nllb.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index 1766bb1cf9e00..0735692680a94 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -125,10 +125,10 @@ def __init__( self.vocab_file = vocab_file # Original fairseq vocab and spm vocab must be "aligned": - # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 - # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- - # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' - # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | ---- | ---- | ---- | ---- | ---- | ---- + # fairseq | '' | '' | '' | '' | 'an' | '▁n' | '▁m' | '▁t' | '▁k' | '▁a' + # spm | '' | '' | '' | 'an' | '▁n' | '▁m' | '▁t' | '▁k' | '▁a' | '▁s' # Mimic fairseq token-to-id alignment for the first 4 token self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} From 8ce67c4642c5ea5bf3f30db43514d687a381bb13 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Thu, 14 Jul 2022 09:36:03 +0200 Subject: [PATCH 3/8] Final touches --- README.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/_toctree.yml | 2 + docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/nllb.mdx | 76 +++++++++++++++++++++ tests/models/nllb/test_tokenization_nllb.py | 6 +- 8 files changed, 86 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 7cb10a0da9308..a90f19ae83b2d 100644 --- a/README.md +++ b/README.md @@ -304,7 +304,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. 
**[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from ) released with the paper []() by . +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/README_ko.md b/README_ko.md index 1fd4bda835249..97b5a0131326f 100644 --- a/README_ko.md +++ b/README_ko.md @@ -285,7 +285,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from ) released with the paper []() by . +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. 
**[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/README_zh-hans.md b/README_zh-hans.md index 2755dcfbd7887..e50b1b4e1f469 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -309,7 +309,7 @@ conda install -c huggingface transformers 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。 -1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from ) released with the paper []() by . +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 8d5c425a095d9..86bc771655ec2 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -321,7 +321,7 @@ conda install -c huggingface transformers 1. 
**[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from ) released with the paper []() by . +1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a8cd1e35a5ae4..147eba98e25ae 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -321,6 +321,8 @@ - local: model_doc/nezha title: NEZHA - local: model_doc/nystromformer + title: NLLB + - local: model_doc/nllb title: Nyströmformer - local: model_doc/opt title: OPT diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index e32e3e9b1be0a..67de643e08287 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -127,7 +127,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. 
**[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](model_doc/nllb)** (from ) released with the paper []() by . +1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/docs/source/en/model_doc/nllb.mdx b/docs/source/en/model_doc/nllb.mdx index 81a0e30ee6627..48a0f1584d4bb 100644 --- a/docs/source/en/model_doc/nllb.mdx +++ b/docs/source/en/model_doc/nllb.mdx @@ -12,6 +12,82 @@ specific language governing permissions and limitations under the License. # NLLB +**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign +@LysandreJik + +## Overview of NLLB + +The NLLB model was presented in [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by Marta R. Costa-jussà, James Cross, Onur Çelebi, +Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, +Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews, +Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, +Safiyyah Saleem, Holger Schwenk, and Jeff Wang. + +The abstract of the paper is the following: + +*Driven by the goal of eradicating language barriers on a global scale, machine translation has solidified itself as a key focus of artificial intelligence research today. +However, such efforts have coalesced around a small subset of languages, leaving behind the vast majority of mostly low-resource languages. What does it take to break the +200 language barrier while ensuring safe, high quality results, all while keeping ethical considerations in mind? In No Language Left Behind, we took on this challenge by +first contextualizing the need for low-resource language translation support through exploratory interviews with native speakers. Then, we created datasets and models aimed +at narrowing the performance gap between low and high-resource languages. 
More specifically, we developed a conditional compute model based on Sparsely Gated Mixture of +Experts that is trained on data obtained with novel and effective data mining techniques tailored for low-resource languages. We propose multiple architectural and training +improvements to counteract overfitting while training on thousands of tasks. Critically, we evaluated the performance of over 40,000 different translation directions using +a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity benchmark covering all languages in Flores-200 to assess translation safety. +Our model achieves an improvement of 44% BLEU relative to the previous state-of-the-art, laying important groundwork towards realizing a universal translation system.* + +This implementation contains the dense models available on release. Let us know via a GitHub issue if you would like to see the MoE models as well. + +This model was contributed by [Lysandre](https://huggingface.co/lysandre). The Authors' code can be found [here](https://github.com/facebookresearch/fairseq/tree/nllb). + +## Generating with NLLB + +While generating the target text set the `forced_bos_token_id` to the target language id. The following +example shows how to translate English to French using the *facebook/nllb-200-distilled-600M* model. + +Note that we're using the BCP-47 code for French `fra_Latn`. See [here](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) +for the list of all BCP-47 in the Flores 200 dataset. + +```python +>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M") + +>>> article = "UN Chief says there is no military solution in Syria" +>>> inputs = tokenizer(article, return_tensors="pt") + +>>> translated_tokens = model.generate( +... **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"], max_length=30 +... ) +>>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +Le chef de l'ONU dit qu'il n'y a pas de solution militaire en Syrie +``` + +### Generating from any other language than English + +English (`eng_Latn`) is set as the default language from which to translate. In order to specify that you'd like to translate from a different language, +you should specify the BCP-47 code in the `src_lang` keyword argument of the tokenizer initialization. + +See example below for a translation from romanian to german: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained( +... "facebook/nllb-200-distilled-600M", use_auth_token=True, src_lang="ron_Latn" +... ) +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", use_auth_token=True) + +>>> article = "Şeful ONU spune că nu există o soluţie militară în Siria" +>>> inputs = tokenizer(article, return_tensors="pt") + +>>> translated_tokens = model.generate( +... **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["deu_Latn"], max_length=30 +... 
) +>>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +UN-Chef sagt, es gibt keine militärische Lösung in Syrien +``` + ## NllbTokenizer [[autodoc]] NllbTokenizer diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index 53123c17b3b6a..a6c087dcb0945 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -52,7 +52,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): rust_tokenizer_class = NllbTokenizerFast test_rust_tokenizer = True test_sentencepiece = True - from_pretrained_kwargs = {"use_auth_token": True} + from_pretrained_kwargs = {} def setUp(self): super().setUp() @@ -146,7 +146,7 @@ def test_save_pretrained(self): self.tokenizers_list[0] = ( self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", - {"use_auth_token": True}, + {} ) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -337,7 +337,7 @@ class NllbDistilledIntegrationTest(unittest.TestCase): @classmethod def setUpClass(cls): cls.tokenizer: NllbTokenizer = NllbTokenizer.from_pretrained( - cls.checkpoint_name, src_lang="eng_Latn", tgt_lang="ron_Latn", use_auth_token=True + cls.checkpoint_name, src_lang="eng_Latn", tgt_lang="ron_Latn" ) cls.pad_token_id = 1 return cls From 33028f4c795e76f9e97226fc591bc7d0b8c7d815 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 14 Jul 2022 09:46:43 +0200 Subject: [PATCH 4/8] Style :) --- tests/models/nllb/test_tokenization_nllb.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index a6c087dcb0945..c721fc5a71276 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -143,11 +143,7 @@ def test_save_pretrained(self): # as we don't have a slow version, we can't compare the outputs between slow and fast versions return - self.tokenizers_list[0] = ( - self.rust_tokenizer_class, - "hf-internal-testing/tiny-random-nllb", - {} - ) + self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) From 2d89468175a56b432412eb2ff84b05500b5d80bb Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 18 Jul 2022 05:47:51 -0400 Subject: [PATCH 5/8] Update docs/source/en/model_doc/nllb.mdx Co-authored-by: Stefan Schweter --- docs/source/en/model_doc/nllb.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/nllb.mdx b/docs/source/en/model_doc/nllb.mdx index 48a0f1584d4bb..739209896da71 100644 --- a/docs/source/en/model_doc/nllb.mdx +++ b/docs/source/en/model_doc/nllb.mdx @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. 
# NLLB -**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign +**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=bug&template=bug-report.yml) and assign @LysandreJik ## Overview of NLLB From 2e8f5b61ee8c68fbaee97187ade53940d436eca5 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 18 Jul 2022 05:49:29 -0400 Subject: [PATCH 6/8] Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- README.md | 2 +- docs/source/en/model_doc/nllb.mdx | 2 +- src/transformers/models/nllb/tokenization_nllb.py | 4 ++-- src/transformers/models/nllb/tokenization_nllb_fast.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a90f19ae83b2d..5ae5a8229c8a0 100644 --- a/README.md +++ b/README.md @@ -304,7 +304,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB](https://huggingface.co/docs/transformers/main/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 
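To make the sequence format exercised by the tests above concrete: source text is encoded as `tokens </s> src_lang_code`, target text is tokenized inside `as_target_tokenizer()`, and decoder inputs are built with `shift_tokens_right` using the target language code as the decoder start token. The snippet below is a minimal sketch, not part of the patch itself; it assumes the `NllbTokenizer` introduced in this PR, a local PyTorch install, and access to the `facebook/nllb-200-distilled-600M` checkpoint.

```python
# Minimal sketch (assumes PyTorch and access to facebook/nllb-200-distilled-600M).
from transformers import NllbTokenizer
from transformers.models.m2m_100.modeling_m2m_100 import shift_tokens_right

tokenizer = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="ron_Latn"
)

# Source sequences end with </s> followed by the source language code,
# as checked by test_enro_tokenizer_truncation above.
ids = tokenizer("UN Chief says there is no military solution in Syria").input_ids
assert ids[-2] == tokenizer.eos_token_id
assert ids[-1] == tokenizer.lang_code_to_id["eng_Latn"]

# Targets are tokenized inside as_target_tokenizer(); decoder inputs are then built
# with shift_tokens_right, mirroring test_enro_tokenizer_prepare_batch.
batch = tokenizer(["UN Chief says there is no military solution in Syria"], return_tensors="pt")
with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        ["Şeful ONU declară că nu există o soluţie militară în Siria"], return_tensors="pt"
    ).input_ids
batch["decoder_input_ids"] = shift_tokens_right(
    labels, tokenizer.pad_token_id, decoder_start_token_id=tokenizer.lang_code_to_id["ron_Latn"]
)
```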
diff --git a/docs/source/en/model_doc/nllb.mdx b/docs/source/en/model_doc/nllb.mdx index 739209896da71..477ef1ca83ec4 100644 --- a/docs/source/en/model_doc/nllb.mdx +++ b/docs/source/en/model_doc/nllb.mdx @@ -37,7 +37,7 @@ Our model achieves an improvement of 44% BLEU relative to the previous state-of- This implementation contains the dense models available on release. Let us know via a GitHub issue if you would like to see the MoE models as well. -This model was contributed by [Lysandre](https://huggingface.co/lysandre). The Authors' code can be found [here](https://github.com/facebookresearch/fairseq/tree/nllb). +This model was contributed by [Lysandre](https://huggingface.co/lysandre). The authors' code can be found [here](https://github.com/facebookresearch/fairseq/tree/nllb). ## Generating with NLLB diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index 0735692680a94..787f1eca2cc54 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -54,8 +54,8 @@ class NllbTokenizer(PreTrainedTokenizer): Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece). - The tokenization method is ` ` for source language documents, and `` - ``` for target language documents. + The tokenization method is ` ` for source language documents, and ` + ` for target language documents. Examples: diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index a71b89bb24f6c..528574a64ff44 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -67,8 +67,8 @@ class NllbTokenizerFast(PreTrainedTokenizerFast): This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. - The tokenization method is ` ` for source language documents, and `` - ``` for target language documents. + The tokenization method is ` ` for source language documents, and ` + ` for target language documents. Examples: From 9a156753dcbe53013b5d9512e68a13fe4eab0d8b Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 18 Jul 2022 12:12:37 +0200 Subject: [PATCH 7/8] PR reviews --- docs/source/en/_toctree.yml | 4 +- .../models/nllb/tokenization_nllb.py | 52 ++++++++++++++++++- .../models/nllb/tokenization_nllb_fast.py | 48 ++++++++++++++++- tests/models/nllb/test_tokenization_nllb.py | 7 --- 4 files changed, 99 insertions(+), 12 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 147eba98e25ae..7ca43eca35a13 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -320,9 +320,9 @@ title: MVP - local: model_doc/nezha title: NEZHA - - local: model_doc/nystromformer - title: NLLB - local: model_doc/nllb + title: NLLB + - local: model_doc/nystromformer title: Nyströmformer - local: model_doc/opt title: OPT diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index 787f1eca2cc54..ef0ee942bfa92 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -71,7 +71,55 @@ class NllbTokenizer(PreTrainedTokenizer): >>> with tokenizer.as_target_tokenizer(): ... 
labels = tokenizer(expected_translation_french, return_tensors="pt") >>> inputs["labels"] = labels["input_ids"] - ```""" + ``` + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenizer_file (`str`, *optional*): + The path to a tokenizer file to use instead of the vocab file. + src_lang (`str`, *optional*): + The language to use as source language for translation. + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + sp_model_kwargs (`Dict[str, str]`): + Additional keyword arguments to pass to the model initialization. + """ vocab_files_names = VOCAB_FILES_NAMES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES @@ -112,7 +160,7 @@ def __init__( cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, - tokenizer_file=None, + tokenizer_file=tokenizer_file, src_lang=src_lang, tgt_lang=tgt_lang, additional_special_tokens=additional_special_tokens, diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index 528574a64ff44..fa4eaa4c5a80f 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -84,7 +84,53 @@ class NllbTokenizerFast(PreTrainedTokenizerFast): >>> with tokenizer.as_target_tokenizer(): ... labels = tokenizer(expected_translation_french, return_tensors="pt") >>> inputs["labels"] = labels["input_ids"] - ```""" + ``` + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. 
+ + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenizer_file (`str`, *optional*): + The path to a tokenizer file to use instead of the vocab file. + src_lang (`str`, *optional*): + The language to use as source language for translation. + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + """ vocab_files_names = VOCAB_FILES_NAMES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index c721fc5a71276..10575084a7278 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -105,7 +105,6 @@ def test_full_tokenizer(self): [ value + tokenizer.fairseq_offset for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] - # ^ unk: 2 + 1 = 3 unk: 2 + 1 = 3 ^ ], ) @@ -139,10 +138,6 @@ def test_full_tokenizer(self): # overwrite from test_tokenization_common to speed up test def test_save_pretrained(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -166,8 +161,6 @@ def test_save_pretrained(self): # Check special tokens are set accordingly on Rust and Python for key in tokenizer_pp.special_tokens_map: self.assertTrue(hasattr(tokenizer_rp, key)) - # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) - # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) shutil.rmtree(tmpdirname2) From 4ec6cf1ed609e1830c74177fe5277c1064e8900e Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 18 Jul 2022 12:21:55 +0200 Subject: [PATCH 8/8] Auto models --- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- src/transformers/models/auto/modeling_auto.py | 3 +++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README_ko.md b/README_ko.md index 
97b5a0131326f..ff953765ca5d5 100644 --- a/README_ko.md +++ b/README_ko.md @@ -285,7 +285,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB](https://huggingface.co/docs/transformers/main/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/README_zh-hans.md b/README_zh-hans.md index e50b1b4e1f469..998d90e378a42 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -309,7 +309,7 @@ conda install -c huggingface transformers 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (来自 中国人民大学 AI Box) 伴随论文 [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) 由 Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen 发布。 1. 
**[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (来自华为诺亚方舟实验室) 伴随论文 [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) 由 Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu 发布。 -1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 +1. **[NLLB](https://huggingface.co/docs/transformers/main/model_doc/nllb)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 86bc771655ec2..66919dfe9cd21 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -321,7 +321,7 @@ conda install -c huggingface transformers 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MVP](https://huggingface.co/docs/transformers/main/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 1. **[Nezha](https://huggingface.co/docs/transformers/main/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. -1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[NLLB](https://huggingface.co/docs/transformers/main/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. 
**[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 51c63aaf5dd4a..3cd574dea1a20 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -94,6 +94,7 @@ ("mt5", "MT5Model"), ("mvp", "MvpModel"), ("nezha", "NezhaModel"), + ("nllb", "M2M100Model"), ("nystromformer", "NystromformerModel"), ("openai-gpt", "OpenAIGPTModel"), ("opt", "OPTModel"), @@ -234,6 +235,7 @@ ("mpnet", "MPNetForMaskedLM"), ("mvp", "MvpForConditionalGeneration"), ("nezha", "NezhaForMaskedLM"), + ("nllb", "M2M100ForConditionalGeneration"), ("nystromformer", "NystromformerForMaskedLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), ("plbart", "PLBartForConditionalGeneration"), @@ -447,6 +449,7 @@ ("mbart", "MBartForConditionalGeneration"), ("mt5", "MT5ForConditionalGeneration"), ("mvp", "MvpForConditionalGeneration"), + ("nllb", "M2M100ForConditionalGeneration"), ("pegasus", "PegasusForConditionalGeneration"), ("plbart", "PLBartForConditionalGeneration"), ("prophetnet", "ProphetNetForConditionalGeneration"),
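The auto-model entries added above route the `nllb` model type to the existing M2M100 classes, so these checkpoints ship with no new modeling code. A minimal sketch of what that mapping implies, assuming access to the `facebook/nllb-200-distilled-600M` checkpoint used elsewhere in this PR:

```python
# Minimal sketch: relies on the ("nllb", "M2M100ForConditionalGeneration") mapping added above.
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

# NLLB reuses the M2M100 architecture, so the auto class resolves to it.
print(type(model).__name__)  # expected: M2M100ForConditionalGeneration
```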