diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py index 89a15ebbaf..54b4d36433 100644 --- a/nltk/corpus/__init__.py +++ b/nltk/corpus/__init__.py @@ -300,10 +300,10 @@ 'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8' ) swadesh110 = LazyCorpusLoader( - 'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8' + 'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8' ) swadesh207 = LazyCorpusLoader( - 'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8' + 'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8' ) switchboard = LazyCorpusLoader('switchboard', SwitchboardCorpusReader, tagset='wsj') timit = LazyCorpusLoader('timit', TimitCorpusReader) diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py index 19c151585e..f8c9585c52 100644 --- a/nltk/corpus/reader/__init__.py +++ b/nltk/corpus/reader/__init__.py @@ -103,6 +103,7 @@ from nltk.corpus.reader.categorized_sents import * from nltk.corpus.reader.comparative_sents import * from nltk.corpus.reader.panlex_lite import * +from nltk.corpus.reader.panlex_swadesh import * # Make sure that nltk.corpus.reader.bracket_parse gives the module, not # the function bracket_parse() defined in nltk.tree: @@ -178,4 +179,5 @@ 'NonbreakingPrefixesCorpusReader', 'UnicharsCorpusReader', 'MWAPPDBCorpusReader', + 'PanlexSwadeshCorpusReader', ] diff --git a/nltk/corpus/reader/panlex_swadesh.py b/nltk/corpus/reader/panlex_swadesh.py new file mode 100644 index 0000000000..221d3173a2 --- /dev/null +++ b/nltk/corpus/reader/panlex_swadesh.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +# Natural Language Toolkit: Word List Corpus Reader +# +# Copyright (C) 2001-2019 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + + +from __future__ import print_function +from collections import namedtuple, defaultdict +import re +from six import 
string_types  # tail of the ``from six import`` statement begun on the previous source line


from nltk.tokenize import line_tokenize

from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

# One record per non-empty row of the tab-separated ``langs<size>.txt`` file.
PanlexLanguage = namedtuple(
    'PanlexLanguage',
    [
        'panlex_uid',  # (1) PanLex UID
        'iso639',  # (2) ISO 639 language code
        'iso639_type',  # (3) ISO 639 language type, see README
        'script',  # (4) normal scripts of expressions
        'name',  # (5) PanLex default name
        'langvar_uid',  # (6) UID of the language variety in which the default name is an expression
    ],
)


class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        """
        Initialise the reader and load the language index.

        All arguments are forwarded unchanged to the parent constructor.
        The Swadesh list size (e.g. ``'110'``) is taken from the directory
        component of the first fileid, then the language index file is read
        to build the UID -> language and ISO 639 -> UIDs lookup tables.

        :raise ValueError: if the reader has no fileids, or the first fileid
            does not start with ``swadesh<digits>/``.
        """
        super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
        fileids = self.fileids()
        if not fileids:
            raise ValueError(
                'Cannot determine the Swadesh list size: the corpus has no fileids.'
            )
        # Find the swadesh size using the fileids' path.
        size_match = re.match(r'swadesh([0-9]+)/', fileids[0])
        if size_match is None:
            raise ValueError(
                'Cannot determine the Swadesh list size from fileid {!r}.'.format(
                    fileids[0]
                )
            )
        self.swadesh_size = size_match.group(1)
        # get_macrolanguages() reads self._languages, so the order matters.
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        self._macro_languages = self.get_macrolanguages()

    def license(self):
        """Print the name of the license this corpus is distributed under."""
        print('CC0 1.0 Universal')

    def readme(self):
        """Print the contents of the corpus ``README`` file."""
        print(self.raw('README'))

    def language_codes(self):
        """
        :return: the PanLex UIDs of every language found in the index file.
        """
        return self._languages.keys()

    def get_languages(self):
        """
        Parse the ``langs<size>.txt`` index file.

        :return: a generator of ``PanlexLanguage`` records, one per
            non-empty line of the index.
        """
        index_fileid = 'langs{}.txt'.format(self.swadesh_size)
        for line in self.raw(index_fileid).split('\n'):
            if not line.strip():  # Skip empty lines.
                continue
            # Split *before* stripping: stripping the whole line first would
            # swallow trailing tabs and drop empty trailing columns, leaving
            # too few fields for the namedtuple.
            yield PanlexLanguage(*[field.strip() for field in line.split('\t')])

    def get_macrolanguages(self):
        """
        Group the loaded languages by their ISO 639 code.

        :return: a ``defaultdict(list)`` mapping each ISO 639 code to the
            list of PanLex UIDs that carry it.
        """
        macro_languages = defaultdict(list)
        for lang in self._languages.values():
            macro_languages[lang.iso639].append(lang.panlex_uid)
        return macro_languages

    def words_by_lang(self, lang_code):
        """
        :param lang_code: a PanLex UID naming one ``<lang_code>.txt`` file.
        :return: a list of list(str), one inner list per line of the file,
            obtained by splitting the line on tabs.
        """
        fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
        return [concept.split('\t') for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :param iso63_code: an ISO 639 code as found in the language index.
        :return: a list of list(str) gathered from every language variety
            sharing that code; empty if the code is unknown.
        """
        # .get() avoids inserting an empty entry into the defaultdict when
        # an unknown code is queried.
        lang_codes = self._macro_languages.get(iso63_code, ())
        fileids = [
            'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
            for lang_code in lang_codes
        ]
        return [
            concept.split('\t')
            for fileid in fileids
            for concept in self.words(fileid)
        ]

    def entries(self, fileids=None):
        """
        :param fileids: a fileid or a list of fileids; defaults to every
            fileid of this reader.
        :return: a list of tuples; the i-th tuple holds the i-th word of
            each of the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()
        elif isinstance(fileids, string_types):
            # A bare string would otherwise be iterated character by character.
            fileids = [fileids]

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))