/
panlex_lite.py
174 lines (141 loc) · 5.14 KB
/
panlex_lite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Natural Language Toolkit: PanLex Corpus Reader
#
# Copyright (C) 2001-2021 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
as an SQLite database. See the README.txt in the panlex_lite corpus directory
for more information on PanLex Lite.
"""
import os
import sqlite3
from nltk.corpus.reader.api import CorpusReader
class PanLexLiteCorpusReader(CorpusReader):
MEANING_Q = """
SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
FROM dnx
JOIN ex ON (ex.ex = dnx.ex)
JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
JOIN ex ex2 ON (ex2.ex = dnx2.ex)
WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
ORDER BY dnx2.uq DESC
"""
TRANSLATION_Q = """
SELECT s.tt, sum(s.uq) AS trq FROM (
SELECT ex2.tt, max(dnx.uq) AS uq
FROM dnx
JOIN ex ON (ex.ex = dnx.ex)
JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
JOIN ex ex2 ON (ex2.ex = dnx2.ex)
WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
GROUP BY ex2.tt, dnx.ui
) s
GROUP BY s.tt
ORDER BY trq DESC, s.tt
"""
def __init__(self, root):
self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()
self._uid_lv = {}
self._lv_uid = {}
for row in self._c.execute("SELECT uid, lv FROM lv"):
self._uid_lv[row[0]] = row[1]
self._lv_uid[row[1]] = row[0]
def language_varieties(self, lc=None):
"""
Return a list of PanLex language varieties.
:param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
by this code. If unspecified, all varieties are returned.
:return: the specified language varieties as a list of tuples. The first
element is the language variety's seven-character uniform identifier,
and the second element is its default name.
:rtype: list(tuple)
"""
if lc is None:
return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
else:
return self._c.execute(
"SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
).fetchall()
def meanings(self, expr_uid, expr_tt):
"""
Return a list of meanings for an expression.
:param expr_uid: the expression's language variety, as a seven-character
uniform identifier.
:param expr_tt: the expression's text.
:return: a list of Meaning objects.
:rtype: list(Meaning)
"""
expr_lv = self._uid_lv[expr_uid]
mn_info = {}
for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
mn = i[0]
uid = self._lv_uid[i[5]]
if not mn in mn_info:
mn_info[mn] = {
"uq": i[1],
"ap": i[2],
"ui": i[3],
"ex": {expr_uid: [expr_tt]},
}
if not uid in mn_info[mn]["ex"]:
mn_info[mn]["ex"][uid] = []
mn_info[mn]["ex"][uid].append(i[4])
return [Meaning(mn, mn_info[mn]) for mn in mn_info]
def translations(self, from_uid, from_tt, to_uid):
"""
Return a list of translations for an expression into a single language
variety.
:param from_uid: the source expression's language variety, as a
seven-character uniform identifier.
:param from_tt: the source expression's text.
:param to_uid: the target language variety, as a seven-character
uniform identifier.
:return: a list of translation tuples. The first element is the expression
text and the second element is the translation quality.
:rtype: list(tuple)
"""
from_lv = self._uid_lv[from_uid]
to_lv = self._uid_lv[to_uid]
return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
class Meaning(dict):
"""
Represents a single PanLex meaning. A meaning is a translation set derived
from a single source.
"""
def __init__(self, mn, attr):
super().__init__(**attr)
self["mn"] = mn
def id(self):
"""
:return: the meaning's id.
:rtype: int
"""
return self["mn"]
def quality(self):
"""
:return: the meaning's source's quality (0=worst, 9=best).
:rtype: int
"""
return self["uq"]
def source(self):
"""
:return: the meaning's source id.
:rtype: int
"""
return self["ap"]
def source_group(self):
"""
:return: the meaning's source group id.
:rtype: int
"""
return self["ui"]
def expressions(self):
"""
:return: the meaning's expressions as a dictionary whose keys are language
variety uniform identifiers and whose values are lists of expression
texts.
:rtype: dict
"""
return self["ex"]