Merge pull request #2291 from alvations/better-panlex

Added an improved panlex swadesh reader
nltk · Jul 4, 2019 · f6a4f38 · f6a4f38
2 parents 8c75c56 + ba70dbf
commit f6a4f38
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 2 deletions.
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
@@ -300,10 +300,10 @@
     'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8'
 )
 swadesh110 = LazyCorpusLoader(
-    'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
+    'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
 )
 swadesh207 = LazyCorpusLoader(
-    'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
+    'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
 )
 switchboard = LazyCorpusLoader('switchboard', SwitchboardCorpusReader, tagset='wsj')
 timit = LazyCorpusLoader('timit', TimitCorpusReader)

diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py
@@ -103,6 +103,7 @@
 from nltk.corpus.reader.categorized_sents import *
 from nltk.corpus.reader.comparative_sents import *
 from nltk.corpus.reader.panlex_lite import *
+from nltk.corpus.reader.panlex_swadesh import *
 
 # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
 # the function bracket_parse() defined in nltk.tree:
@@ -178,4 +179,5 @@
     'NonbreakingPrefixesCorpusReader',
     'UnicharsCorpusReader',
     'MWAPPDBCorpusReader',
+    'PanlexSwadeshCorpusReader',
 ]
diff --git a/nltk/corpus/reader/panlex_swadesh.py b/nltk/corpus/reader/panlex_swadesh.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: PanLex Swadesh Corpus Reader
+#
+# Copyright (C) 2001-2019 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+from __future__ import print_function
+from collections import namedtuple, defaultdict
+import re
+from six import string_types
+
+
+from nltk.tokenize import line_tokenize
+
+from nltk.corpus.reader.wordlist import WordListCorpusReader
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+# Record type for one entry of the PanLex language table.  Fields, in order:
+#   panlex_uid  -- (1) PanLex UID
+#   iso639      -- (2) ISO 639 language code
+#   iso639_type -- (3) ISO 639 language type, see README
+#   script      -- (4) normal scripts of expressions
+#   name        -- (5) PanLex default name
+#   langvar_uid -- (6) UID of the language variety in which the default
+#                      name is an expression
+PanlexLanguage = namedtuple(
+    'PanlexLanguage',
+    ('panlex_uid', 'iso639', 'iso639_type', 'script', 'name', 'langvar_uid'),
+)
+
+class PanlexSwadeshCorpusReader(WordListCorpusReader):
+    """
+    Reader for the PanLex Swadesh word lists, as described in
+
+    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
+    PanLex: Building a Resource for Panlingual Lexical Translation.
+    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf
+
+    License: CC0 1.0 Universal
+    https://creativecommons.org/publicdomain/zero/1.0/legalcode
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Set up the reader and index the language table.
+
+        All arguments are passed through unchanged to
+        ``WordListCorpusReader``.
+
+        :raise ValueError: if the reader has no fileids, or the first
+            fileid does not start with ``swadesh<digits>/``.
+        """
+        super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
+        # Find the swadesh size using the fileids' path, i.e. the digits in
+        # 'swadesh<size>/<file>'.  Only digits are captured so that nothing
+        # after the size (such as further path components) leaks into it.
+        fileids = self.fileids()
+        match = re.match(r'swadesh([0-9]+)/', fileids[0]) if fileids else None
+        if match is None:
+            raise ValueError(
+                "Cannot determine the Swadesh list size: expected fileids "
+                "of the form 'swadesh<size>/<file>', got {!r}".format(fileids[:1])
+            )
+        self.swadesh_size = match.group(1)
+        # PanLex UID -> PanlexLanguage record.
+        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
+        # ISO 639 code -> list of PanLex UIDs sharing that code.
+        self._macro_languages = self.get_macrolanguages()
+        # Misspelled name kept as an alias of the same mapping so that any
+        # existing code reading the old attribute keeps working.
+        self._macro_langauges = self._macro_languages
+
+    def license(self):
+        """
+        Print the name of the license of this corpus.
+
+        The text is written to standard output; nothing is returned.
+        """
+        print('CC0 1.0 Universal')
+
+    def readme(self):
+        """
+        Print the contents of the corpus ``README`` file.
+
+        The text is written to standard output; nothing is returned.
+        """
+        print(self.raw('README'))
+
+    def language_codes(self):
+        """
+        :return: the PanLex UIDs of all languages in the language table.
+        """
+        return self._languages.keys()
+
+    def get_languages(self):
+        """
+        Read the language table ``langs<size>.txt`` of this corpus.
+
+        Each non-empty line is stripped, split on tabs and unpacked into a
+        ``PanlexLanguage``; a line with a different number of columns than
+        ``PanlexLanguage`` has fields therefore raises ``TypeError``.
+
+        :return: a generator of ``PanlexLanguage`` records, in file order.
+        """
+        table = self.raw('langs{}.txt'.format(self.swadesh_size))
+        for line in table.split('\n'):
+            if not line.strip():  # Skip empty lines.
+                continue
+            yield PanlexLanguage(*line.strip().split('\t'))
+
+    def get_macrolanguages(self):
+        """
+        Group the known languages by their ISO 639 code.
+
+        :return: a ``defaultdict(list)`` mapping each ISO 639 code to the
+            list of PanLex UIDs whose record carries that code.
+        """
+        macro_languages = defaultdict(list)
+        for lang in self._languages.values():
+            macro_languages[lang.iso639].append(lang.panlex_uid)
+        return macro_languages
+
+    def words_by_lang(self, lang_code):
+        """
+        Return the Swadesh list of a single language variety.
+
+        :param lang_code: the PanLex UID naming the file
+            ``swadesh<size>/<lang_code>.txt``.
+        :return: a list of list(str); one inner list per entry of the
+            file, obtained by splitting that entry on tabs.
+        """
+        fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
+        return [concept.split('\t') for concept in self.words(fileid)]
+
+    def words_by_iso639(self, iso63_code):
+        """
+        Return the Swadesh lists of every variety sharing an ISO 639 code.
+
+        :param iso63_code: the ISO 639 code to look up, compared against
+            the ``iso639`` field of the language table.
+        :return: a list of list(str); the tab-split entries of each
+            matching file, concatenated file after file.  The list is
+            empty when the code is unknown.
+        """
+        # Use .get() rather than indexing: the mapping is a defaultdict, so
+        # indexing would insert an empty entry for every unknown code.
+        lang_codes = self._macro_languages.get(iso63_code, [])
+        fileids = ['swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
+                   for lang_code in lang_codes]
+        return [concept.split('\t') for fileid in fileids for concept in self.words(fileid)]
+
+    def entries(self, fileids=None):
+        """
+        Align the word lists of several files position by position.
+
+        :param fileids: a fileid or a list of fileids; when omitted or
+            empty, every fileid of this reader is used.
+        :return: a list of tuples; the i-th tuple holds the i-th word of
+            each requested file, in the order of ``fileids``.  The list is
+            as long as the shortest word list.
+        """
+        if not fileids:
+            fileids = self.fileids()
+        elif isinstance(fileids, string_types):
+            # A bare fileid would otherwise be iterated character by
+            # character.
+            fileids = [fileids]
+
+        wordlists = [self.words(f) for f in fileids]
+        return list(zip(*wordlists))