/
knbc.py
188 lines (143 loc) · 5.47 KB
/
knbc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2021 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
import re
from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
from nltk.corpus.reader.util import (
FileSystemPathPointer,
find_corpus_fileids,
read_blankline_block,
)
from nltk.parse import DependencyGraph
# default function to convert morphlist to str for tree representation
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
class KNBCorpusReader(SyntaxCorpusReader):
    """
    Corpus reader for KNP-format files such as the KNB corpus.

    This class implements:

    - ``__init__``, which records the corpus location and how sentence
      blocks are detected in corpus files.
    - ``_read_block``, which pulls one block from the input stream.
    - ``_word``, which turns a block into a list of words.
    - ``_tag``, which turns a block into a list of tagged words.
    - ``_parse``, which turns a block into a parsed sentence.

    The structure of tagged words:

        tagged_word = (word(str), tags(tuple))
        tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )
    >>> len(knbc.sents()[0])
    9
    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        ``morphs2str`` is a function converting a morph list to a string,
        used by ``_parse()`` when building the tree representation.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # Sentence blocks are separated by blank lines (or EOF) - default.
        return read_blankline_block(stream)

    def _word(self, t):
        # Keep only the surface form (first column) of each morph line;
        # header/terminator lines (EOS, *, #, +) are skipped.
        return [
            line.strip().split(" ")[0]
            for line in t.splitlines()
            if not re.match(r"EOS|\*|\#|\+", line)
        ]

    # ignores tagset argument
    def _tag(self, t, tagset=None):
        tagged = []
        for line in t.splitlines():
            # Skip bunsetsu/tag header lines and the EOS terminator.
            if re.match(r"EOS|\*|\#|\+", line):
                continue
            fields = line.strip().split(" ")
            # Morph tuple: (surface, remaining columns joined as one string).
            tagged.append((fields[0], " ".join(fields[1:])))
        return tagged

    def _parse(self, t):
        dg = DependencyGraph()
        address = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # Header of a bunsetsu ("*") or tag unit ("+"): its second
                # column encodes "<parent address><dependency type letter>".
                fields = line.strip().split(" ", 3)
                match = re.match(r"([\-0-9]*)([ADIP])", fields[1])
                assert match is not None
                node = dg.nodes[address]
                node.update({"address": address, "rel": match.group(2), "word": []})
                parent = int(match.group(1))
                if parent == -1:
                    # A parent of -1 marks the root of the sentence.
                    dg.root = node
                else:
                    dg.nodes[parent]["deps"].append(address)
                address += 1
            elif line[0] != "#":
                # Ordinary morph line: attach it to the current unit.
                fields = line.strip().split(" ")
                dg.nodes[address - 1]["word"].append(
                    (fields[0], " ".join(fields[1:]))
                )
        if self.morphs2str:
            # Collapse each node's morph list into a display string.
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])
        return dg.tree()
######################################################################
# Demo
######################################################################
def demo():
    """Demonstrate the reader on a locally installed KNB corpus.

    Requires ``corpora/knbc/corpus1`` to be present under ``nltk_data``
    (located via ``nltk.data.find``); prints sample fileids, words,
    parse trees, and tagged sentences.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # Sort key: leading id string, then the three numeric components.
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))
    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Swap in a morphs2str that shows the lemma next to each surface form.
    # BUGFIX: the old trailing ``.encode("utf-8")`` was a Python 2 leftover;
    # on Python 3 it made morphs2str return ``bytes``, so tree labels printed
    # as ``b'...'`` garbage. morphs2str must return ``str``.
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    )

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))
    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def test():
    """Smoke-test the reader against a local KNB corpus.

    Checks that words come back as ``str`` and tagged words as ``tuple``,
    at both the flat and per-sentence access levels.
    """
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )

    first_word = knbc.words()[0]
    assert isinstance(first_word, str)
    first_sent_word = knbc.sents()[0][0]
    assert isinstance(first_sent_word, str)
    first_tagged = knbc.tagged_words()[0]
    assert isinstance(first_tagged, tuple)
    first_sent_tagged = knbc.tagged_sents()[0][0]
    assert isinstance(first_sent_tagged, tuple)
# Script entry point: run the demo (needs a local copy of the KNB corpus
# under nltk_data/corpora/knbc, resolved via nltk.data.find in demo()).
if __name__ == "__main__":
    demo()