Skip to content

Commit

Permalink
Merge pull request #2291 from alvations/better-panlex
Browse files Browse the repository at this point in the history
Added an improved panlex swadesh reader
  • Loading branch information
stevenbird committed Jul 4, 2019
2 parents 8c75c56 + ba70dbf commit f6a4f38
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 2 deletions.
4 changes: 2 additions & 2 deletions nltk/corpus/__init__.py
Expand Up @@ -300,10 +300,10 @@
'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8'
)
swadesh110 = LazyCorpusLoader(
'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
)
swadesh207 = LazyCorpusLoader(
'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
)
switchboard = LazyCorpusLoader('switchboard', SwitchboardCorpusReader, tagset='wsj')
timit = LazyCorpusLoader('timit', TimitCorpusReader)
Expand Down
2 changes: 2 additions & 0 deletions nltk/corpus/reader/__init__.py
Expand Up @@ -103,6 +103,7 @@
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
from nltk.corpus.reader.panlex_swadesh import *

# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
Expand Down Expand Up @@ -178,4 +179,5 @@
'NonbreakingPrefixesCorpusReader',
'UnicharsCorpusReader',
'MWAPPDBCorpusReader',
'PanlexSwadeshCorpusReader',
]
94 changes: 94 additions & 0 deletions nltk/corpus/reader/panlex_swadesh.py
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: PanLex Swadesh Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT


from __future__ import print_function
from collections import namedtuple, defaultdict
import re
from six import string_types


from nltk.tokenize import line_tokenize

from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

# Record describing one PanLex language variety. The reader below builds one
# of these from each tab-separated line of the ``langs<size>.txt`` index, so
# the field order here must match the column order of that file.
PanlexLanguage = namedtuple(
    'PanlexLanguage',
    (
        # (1) PanLex UID
        'panlex_uid',
        # (2) ISO 639 language code
        'iso639',
        # (3) ISO 639 language type, see README
        'iso639_type',
        # (4) normal scripts of expressions
        'script',
        # (5) PanLex default name
        'name',
        # (6) UID of the language variety in which the default name
        #     is an expression
        'langvar_uid',
    ),
)

class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode

    The reader expects its fileids to have the form
    ``swadesh<size>/<panlex_uid>.txt`` and reads the language index from
    ``langs<size>.txt``, where ``<size>`` is taken from the first fileid.
    """

    def __init__(self, *args, **kwargs):
        """
        Build the reader and index the languages listed in the langs file.

        All arguments are forwarded unchanged to ``WordListCorpusReader``.
        """
        super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
        # Find the swadesh size using the fileids' path.  Only the digits
        # directly after "swadesh" are captured, so nothing past the first
        # slash can leak into the size.
        first_fileid = self.fileids()[0]
        self.swadesh_size = re.match(r'swadesh([0-9]+)/', first_fileid).group(1)
        # PanLex UID -> PanlexLanguage record.
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        # ISO 639 code -> list of PanLex UIDs.  The attribute name keeps its
        # original spelling so existing code that reads it keeps working.
        self._macro_langauges = self.get_macrolanguages()

    def license(self):
        """Print the name of the license of the PanLex Swadesh lists."""
        print('CC0 1.0 Universal')

    def readme(self):
        """Print the raw contents of the corpus ``README`` file."""
        print(self.raw('README'))

    def language_codes(self):
        """
        :return: the PanLex UIDs of all languages listed in the langs file.
        """
        return self._languages.keys()

    def get_languages(self):
        """
        Yield one ``PanlexLanguage`` per non-blank line of
        ``langs<size>.txt``.

        Each line is split on tabs and must produce exactly the six fields
        of ``PanlexLanguage``.
        """
        langs_fileid = 'langs{}.txt'.format(self.swadesh_size)
        for line in self.raw(langs_fileid).split('\n'):
            record = line.strip()
            if not record:  # Skip empty lines.
                continue
            yield PanlexLanguage(*record.split('\t'))

    def get_macrolanguages(self):
        """
        Group the indexed languages by their ISO 639 code.

        :return: a ``defaultdict`` mapping each ISO 639 code to the list of
            PanLex UIDs that share it.
        """
        macro_langauges = defaultdict(list)
        for lang in self._languages.values():
            macro_langauges[lang.iso639].append(lang.panlex_uid)
        return macro_langauges

    def words_by_lang(self, lang_code):
        """
        :param lang_code: a PanLex UID naming the file
            ``swadesh<size>/<lang_code>.txt``.
        :return: a list of list(str); each item returned by ``words()`` for
            that file, split on tabs.
        """
        fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
        return [concept.split('\t') for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :param iso63_code: an ISO 639 language code.
        :return: a list of list(str) covering every PanLex language variety
            registered under that code; empty if the code is unknown.
        """
        # Use .get() rather than indexing: the mapping is a defaultdict, and
        # indexing an unknown code would silently add an empty entry to it.
        lang_codes = self._macro_langauges.get(iso63_code, ())
        fileids = [
            'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
            for lang_code in lang_codes
        ]
        return [
            concept.split('\t')
            for fileid in fileids
            for concept in self.words(fileid)
        ]

    def entries(self, fileids=None):
        """
        :param fileids: a fileid or a list of fileids; all fileids of the
            corpus are used when this is empty or None.
        :return: a list of tuples, one per row, each holding the item at
            that row from every requested fileid.  The result is cut off at
            the length of the shortest word list.
        """
        if not fileids:
            fileids = self.fileids()
        elif isinstance(fileids, string_types):
            # A bare string would otherwise be iterated character by
            # character below.
            fileids = [fileids]

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))

0 comments on commit f6a4f38

Please sign in to comment.