Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added an improved panlex swadesh reader #2291

Merged
merged 4 commits into from Jul 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions nltk/corpus/__init__.py
Expand Up @@ -300,10 +300,10 @@
'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8'
)
swadesh110 = LazyCorpusLoader(
'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
)
swadesh207 = LazyCorpusLoader(
'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
)
switchboard = LazyCorpusLoader('switchboard', SwitchboardCorpusReader, tagset='wsj')
timit = LazyCorpusLoader('timit', TimitCorpusReader)
Expand Down
2 changes: 2 additions & 0 deletions nltk/corpus/reader/__init__.py
Expand Up @@ -103,6 +103,7 @@
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
from nltk.corpus.reader.panlex_swadesh import *

# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
Expand Down Expand Up @@ -178,4 +179,5 @@
'NonbreakingPrefixesCorpusReader',
'UnicharsCorpusReader',
'MWAPPDBCorpusReader',
'PanlexSwadeshCorpusReader',
]
94 changes: 94 additions & 0 deletions nltk/corpus/reader/panlex_swadesh.py
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: PanLex Swadesh Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT


from __future__ import print_function
from collections import namedtuple, defaultdict
import re
from six import string_types


from nltk.tokenize import line_tokenize

from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

# One record per row of the PanLex language table; the fields appear in the
# same order as the table's columns.
PanlexLanguage = namedtuple(
    'PanlexLanguage',
    (
        # (1) PanLex UID
        'panlex_uid',
        # (2) ISO 639 language code
        'iso639',
        # (3) ISO 639 language type, see README
        'iso639_type',
        # (4) normal scripts of expressions
        'script',
        # (5) PanLex default name
        'name',
        # (6) UID of the language variety in which the default name is an
        #     expression
        'langvar_uid',
    ),
)

class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode

    The reader works with fileids of the form
    ``swadesh<size>/<panlex_uid>.txt`` and reads the language table from
    ``langs<size>.txt``, whose tab-separated columns are the fields of
    ``PanlexLanguage``.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialise the underlying word-list reader, then load the language
        table that belongs to this Swadesh list.

        All arguments are forwarded unchanged to ``WordListCorpusReader``.

        :raise ValueError: if the corpus has no fileids, or the first fileid
            does not start with ``swadesh<digits>/``.
        """
        super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
        # Find the swadesh size using the fileids' path.
        fileids = self.fileids()
        if not fileids:
            raise ValueError(
                'Cannot determine the Swadesh list size: the corpus has no fileids.'
            )
        size_match = re.match(r'swadesh([0-9]+)/', fileids[0])
        if size_match is None:
            raise ValueError(
                "Cannot determine the Swadesh list size from fileid {!r}; "
                "expected a path starting with 'swadesh<size>/'.".format(fileids[0])
            )
        # Kept as a string: it is only ever interpolated into file names.
        self.swadesh_size = size_match.group(1)
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        # NOTE: the attribute keeps its historical (misspelled) name so that
        # any existing code reading it keeps working.
        self._macro_langauges = self.get_macrolanguages()

    def license(self):
        """Print the name of the corpus license."""
        print('CC0 1.0 Universal')

    def readme(self):
        """Print the raw contents of the corpus ``README`` file."""
        print(self.raw('README'))

    def language_codes(self):
        """
        :return: the PanLex UIDs of all languages listed in the language
            table (the keys of the internal UID -> ``PanlexLanguage`` map).
        """
        return self._languages.keys()

    def get_languages(self):
        """
        Read ``langs<size>.txt`` and yield one record per non-empty line.

        :return: a generator of ``PanlexLanguage`` tuples.
        """
        for line in self.raw('langs{}.txt'.format(self.swadesh_size)).split('\n'):
            row = line.strip()
            if not row:  # Skip empty lines.
                continue
            yield PanlexLanguage(*row.split('\t'))

    def get_macrolanguages(self):
        """
        Group the loaded languages by their ISO 639 code.

        :return: a ``defaultdict(list)`` mapping each ISO 639 code to the
            list of PanLex UIDs that carry it.
        """
        macro_langauges = defaultdict(list)
        for lang in self._languages.values():
            macro_langauges[lang.iso639].append(lang.panlex_uid)
        return macro_langauges

    def words_by_lang(self, lang_code):
        """
        :param lang_code: a PanLex UID; it selects the file
            ``swadesh<size>/<lang_code>.txt``.
        :return: a list of list(str), one inner list per entry of the file,
            obtained by splitting the entry on tab characters.
        """
        fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
        return [concept.split('\t') for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :param iso63_code: an ISO 639 language code.
        :return: a list of list(str) covering every PanLex language variety
            registered under the given code; an empty list if the code is
            unknown.
        """
        # Use .get() rather than indexing: the mapping is a defaultdict, and
        # indexing it with an unknown code would silently add an empty entry.
        lang_codes = self._macro_langauges.get(iso63_code, ())
        fileids = [
            'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
            for lang_code in lang_codes
        ]
        return [
            concept.split('\t')
            for fileid in fileids
            for concept in self.words(fileid)
        ]

    def entries(self, fileids=None):
        """
        :param fileids: the fileids to combine; all fileids of the corpus
            when omitted or empty.
        :return: a list of tuples, where the i-th tuple holds the i-th word
            of each specified fileid. As with ``zip``, the result is
            truncated to the shortest word list.
        """
        if not fileids:
            fileids = self.fileids()

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))