Skip to content

Commit

Permalink
Update IndicGLUE download links (#4978)
Browse files (browse the repository at this point in the history)
update IndicGLUE download links
  • Loading branch information
sumanthd17 committed Sep 15, 2022
1 parent 4c695fa commit 51aef08
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 28 deletions.
2 changes: 1 addition & 1 deletion datasets/indic_glue/dataset_infos.json

Large diffs are not rendered by default.

54 changes: 27 additions & 27 deletions datasets/indic_glue/indic_glue.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -327,38 +327,38 @@
}

_DATA_URLS = {
- "wnli": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/wnli-translated.tar.gz",
- "copa": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/copa-translated.tar.gz",
- "sna": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/soham-articles.tar.gz",
- "csqa": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/wiki-cloze.tar.gz",
- "wstp": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/wiki-section-titles.tar.gz",
- "inltkh": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/inltk-headlines.tar.gz",
- "bbca": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/bbc-articles.tar.gz",
- "cvit-mkb-clsr": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/cvit-mkb.tar.gz",
- "iitp-mr": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/iitp-movie-reviews.tar.gz",
- "iitp-pr": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/iitp-product-reviews.tar.gz",
- "actsa-sc": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/actsa.tar.gz",
- "md": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/midas-discourse.tar.gz",
- "wiki-ner": "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/evaluations/wikiann-ner.tar.gz",
+ "wnli": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/wnli-translated.tar.gz",
+ "copa": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/copa-translated.tar.gz",
+ "sna": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/soham-articles.tar.gz",
+ "csqa": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/wiki-cloze.tar.gz",
+ "wstp": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/wiki-section-titles.tar.gz",
+ "inltkh": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/inltk-headlines.tar.gz",
+ "bbca": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/bbc-articles.tar.gz",
+ "cvit-mkb-clsr": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/cvit-mkb.tar.gz",
+ "iitp-mr": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/iitp-movie-reviews.tar.gz",
+ "iitp-pr": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/iitp-product-reviews.tar.gz",
+ "actsa-sc": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/actsa.tar.gz",
+ "md": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/midas-discourse.tar.gz",
+ "wiki-ner": "https://ai4b-public-nlu-nlg.objectstore.e2enetworks.net/IndicGLUE/wikiann-ner.tar.gz",
}

_URLS = {
- "wnli": "https://indicnlp.ai4bharat.org/indic-glue/#natural-language-inference",
- "copa": "https://indicnlp.ai4bharat.org/indic-glue/#natural-language-inference",
- "sna": "https://indicnlp.ai4bharat.org/indic-glue/#news-category-classification",
- "csqa": "https://indicnlp.ai4bharat.org/indic-glue/#cloze-style-question-answering",
- "wstp": "https://indicnlp.ai4bharat.org/indic-glue/#wikipedia-section-title-prediction",
- "inltkh": "https://indicnlp.ai4bharat.org/indic-glue/#news-category-classification",
- "bbca": "https://indicnlp.ai4bharat.org/indic-glue/#news-category-classification",
- "cvit-mkb-clsr": "https://indicnlp.ai4bharat.org/indic-glue/#cross-lingual-sentence-retrieval",
- "iitp-mr": "https://indicnlp.ai4bharat.org/indic-glue/#sentiment-analysis",
- "iitp-pr": "https://indicnlp.ai4bharat.org/indic-glue/#sentiment-analysis",
- "actsa-sc": "https://indicnlp.ai4bharat.org/indic-glue/#sentiment-analysis",
- "md": "https://indicnlp.ai4bharat.org/indic-glue/#discourse-analysis",
- "wiki-ner": "https://indicnlp.ai4bharat.org/indic-glue/#named-entity-recognition",
+ "wnli": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "copa": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "sna": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "csqa": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "wstp": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "inltkh": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "bbca": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "cvit-mkb-clsr": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "iitp-mr": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "iitp-pr": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "actsa-sc": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "md": "https://ai4bharat.iitm.ac.in/indic-glue",
+ "wiki-ner": "https://ai4bharat.iitm.ac.in/indic-glue",
}

- _INDIC_GLUE_URL = "https://indicnlp.ai4bharat.org/indic-glue/"
+ _INDIC_GLUE_URL = "https://ai4bharat.iitm.ac.in/indic-glue"

_WNLI_LANGS = ["en", "hi", "gu", "mr"]
_COPA_LANGS = ["en", "hi", "gu", "mr"]
Expand Down

1 comment on commit 51aef08

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.007770 / 0.011353 (-0.003583) 0.003625 / 0.011008 (-0.007384) 0.028453 / 0.038508 (-0.010055) 0.029670 / 0.023109 (0.006561) 0.302590 / 0.275898 (0.026692) 0.363005 / 0.323480 (0.039525) 0.005495 / 0.007986 (-0.002491) 0.003032 / 0.004328 (-0.001297) 0.006585 / 0.004250 (0.002334) 0.043365 / 0.037052 (0.006313) 0.313925 / 0.258489 (0.055436) 0.358737 / 0.293841 (0.064896) 0.029095 / 0.128546 (-0.099452) 0.009258 / 0.075646 (-0.066389) 0.246461 / 0.419271 (-0.172811) 0.044850 / 0.043533 (0.001317) 0.304698 / 0.255139 (0.049559) 0.331705 / 0.283200 (0.048505) 0.088696 / 0.141683 (-0.052987) 1.502612 / 1.452155 (0.050458) 1.518572 / 1.492716 (0.025856)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.194211 / 0.018006 (0.176205) 0.427080 / 0.000490 (0.426590) 0.005402 / 0.000200 (0.005202) 0.000231 / 0.000054 (0.000177)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.020885 / 0.037411 (-0.016526) 0.096596 / 0.014526 (0.082071) 0.101182 / 0.176557 (-0.075375) 0.141194 / 0.737135 (-0.595941) 0.106133 / 0.296338 (-0.190206)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.411518 / 0.215209 (0.196309) 4.109938 / 2.077655 (2.032283) 1.866509 / 1.504120 (0.362390) 1.655271 / 1.541195 (0.114076) 1.713996 / 1.468490 (0.245506) 0.456542 / 4.584777 (-4.128235) 3.625812 / 3.745712 (-0.119900) 2.777083 / 5.269862 (-2.492779) 1.475163 / 4.565676 (-3.090513) 0.053975 / 0.424275 (-0.370300) 0.011230 / 0.007607 (0.003623) 0.524778 / 0.226044 (0.298734) 5.269555 / 2.268929 (3.000626) 2.304892 / 55.444624 (-53.139732) 1.965777 / 6.876477 (-4.910700) 2.068635 / 2.142072 (-0.073437) 0.570270 / 4.805227 (-4.234957) 0.119333 / 6.500664 (-6.381331) 0.064606 / 0.075469 (-0.010863)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.506625 / 1.841788 (-0.335163) 12.774565 / 8.074308 (4.700257) 26.257094 / 10.191392 (16.065702) 0.851400 / 0.680424 (0.170976) 0.585187 / 0.534201 (0.050986) 0.344219 / 0.579283 (-0.235064) 0.392976 / 0.434364 (-0.041388) 0.234497 / 0.540337 (-0.305840) 0.235459 / 1.386936 (-1.151477)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.005791 / 0.011353 (-0.005561) 0.003661 / 0.011008 (-0.007348) 0.026851 / 0.038508 (-0.011657) 0.028405 / 0.023109 (0.005296) 0.418774 / 0.275898 (0.142876) 0.480340 / 0.323480 (0.156861) 0.003517 / 0.007986 (-0.004468) 0.003030 / 0.004328 (-0.001299) 0.004629 / 0.004250 (0.000379) 0.037846 / 0.037052 (0.000794) 0.423764 / 0.258489 (0.165275) 0.457590 / 0.293841 (0.163749) 0.027232 / 0.128546 (-0.101314) 0.009491 / 0.075646 (-0.066156) 0.248073 / 0.419271 (-0.171199) 0.056316 / 0.043533 (0.012783) 0.421584 / 0.255139 (0.166445) 0.442262 / 0.283200 (0.159062) 0.097208 / 0.141683 (-0.044475) 1.521707 / 1.452155 (0.069552) 1.540998 / 1.492716 (0.048281)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.176990 / 0.018006 (0.158984) 0.417520 / 0.000490 (0.417030) 0.004226 / 0.000200 (0.004026) 0.000092 / 0.000054 (0.000038)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.020676 / 0.037411 (-0.016735) 0.093453 / 0.014526 (0.078927) 0.104752 / 0.176557 (-0.071804) 0.142835 / 0.737135 (-0.594301) 0.105162 / 0.296338 (-0.191177)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.469472 / 0.215209 (0.254263) 4.684869 / 2.077655 (2.607214) 2.428987 / 1.504120 (0.924867) 2.220125 / 1.541195 (0.678930) 2.279120 / 1.468490 (0.810630) 0.449468 / 4.584777 (-4.135309) 3.324650 / 3.745712 (-0.421062) 1.879953 / 5.269862 (-3.389909) 1.103622 / 4.565676 (-3.462054) 0.053497 / 0.424275 (-0.370778) 0.011611 / 0.007607 (0.004004) 0.573899 / 0.226044 (0.347855) 5.740063 / 2.268929 (3.471135) 2.869928 / 55.444624 (-52.574697) 2.514982 / 6.876477 (-4.361495) 2.638762 / 2.142072 (0.496690) 0.559307 / 4.805227 (-4.245920) 0.120443 / 6.500664 (-6.380221) 0.064659 / 0.075469 (-0.010810)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.563335 / 1.841788 (-0.278453) 12.828814 / 8.074308 (4.754506) 26.199293 / 10.191392 (16.007901) 0.925347 / 0.680424 (0.244923) 0.638495 / 0.534201 (0.104294) 0.345202 / 0.579283 (-0.234081) 0.401895 / 0.434364 (-0.032469) 0.236660 / 0.540337 (-0.303677) 0.242736 / 1.386936 (-1.144200)

CML watermark

Please sign in to comment.