google · tushuhei · May 12, 2022 · May 12, 2022 · May 12, 2022 · May 12, 2022
diff --git a/budoux/feature_extractor.py b/budoux/feature_extractor.py
@@ -14,13 +14,11 @@
 """Methods to encode source sentences to features."""
 
 import bisect
-import itertools
 import json
 import os
-import sys
 import typing
 
-from .utils import INVALID, SEP, Result
+from .utils import INVALID
 
 with open(os.path.join(os.path.dirname(__file__), 'unicode_blocks.json')) as f:
   block_starts: typing.List[int] = json.load(f)
@@ -113,40 +111,3 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
     if INVALID in value:
       del raw_feature[key]
   return [f'{item[0]}:{item[1]}' for item in raw_feature.items()]
-
-
-def process(source_filename: str, entries_filename: str) -> None:
-  """Extratcs features from source sentences and outputs trainig data entries.
-
-  Args:
-    source_filename (str): A file path to the source sentences.
-    entries_filename (str): A file path to the output entries.
-  """
-  with open(source_filename, encoding=sys.getdefaultencoding()) as f:
-    data = f.readlines()
-  with open(entries_filename, 'w', encoding=sys.getdefaultencoding()) as f:
-    f.write('')
-
-  for datum in data:
-    chunks = datum.strip().split(SEP)
-    chunk_lengths = [len(chunk) for chunk in chunks]
-    sep_indices = set(itertools.accumulate(chunk_lengths, lambda x, y: x + y))
-    sentence = ''.join(chunks)
-    p1 = Result.UNKNOWN.value
-    p2 = Result.UNKNOWN.value
-    p3 = Result.UNKNOWN.value
-    for i in range(1, len(sentence) + 1):
-      feature = get_feature(
-          sentence[i - 3] if i > 2 else INVALID,
-          sentence[i - 2] if i > 1 else INVALID, sentence[i - 1],
-          sentence[i] if i < len(sentence) else INVALID,
-          sentence[i + 1] if i + 1 < len(sentence) else INVALID,
-          sentence[i + 2] if i + 2 < len(sentence) else INVALID, p1, p2, p3)
-      positive = i in sep_indices
-      p = Result.POSITIVE.value if positive else Result.NEGATIVE.value
-      with open(entries_filename, 'a', encoding=sys.getdefaultencoding()) as f:
-        row = ['1' if positive else '-1'] + feature
-        f.write('\t'.join(row) + '\n')
-      p1 = p2
-      p2 = p3
-      p3 = p
diff --git a/scripts/context.py b/scripts/context.py
diff --git a/scripts/encode_data.py b/scripts/encode_data.py
@@ -14,8 +14,42 @@
 """Encodes the training data with extracted features."""
 
 import argparse
+import itertools
+import sys
 
-from context import feature_extractor
+from budoux import feature_extractor, utils
+
+
+def process(line: str, entries_filename: str) -> None:
+  """Extracts features from a source sentence and outputs training data entries.
+
+  Args:
+    line (str): A source sentence whose chunks are separated by `utils.SEP`.
+    entries_filename (str): A file path the output entries are appended to.
+  """
+  chunks = line.strip().split(utils.SEP)
+  chunk_lengths = [len(chunk) for chunk in chunks]
+  sep_indices = set(itertools.accumulate(chunk_lengths, lambda x, y: x + y))
+  sentence = ''.join(chunks)
+  p1 = utils.Result.UNKNOWN.value
+  p2 = utils.Result.UNKNOWN.value
+  p3 = utils.Result.UNKNOWN.value
+  lines = []
+  for i in range(1, len(sentence) + 1):
+    feature = feature_extractor.get_feature(
+        sentence[i - 3] if i > 2 else utils.INVALID,
+        sentence[i - 2] if i > 1 else utils.INVALID, sentence[i - 1],
+        sentence[i] if i < len(sentence) else utils.INVALID,
+        sentence[i + 1] if i + 1 < len(sentence) else utils.INVALID,
+        sentence[i + 2] if i + 2 < len(sentence) else utils.INVALID, p1, p2, p3)
+    positive = i in sep_indices
+    p = utils.Result.POSITIVE.value if positive else utils.Result.NEGATIVE.value
+    lines.append('\t'.join(['1' if positive else '-1'] + feature) + '\n')
+    p1 = p2
+    p2 = p3
+    p3 = p
+  with open(entries_filename, 'a', encoding=sys.getdefaultencoding()) as f:
+    f.write(''.join(lines))
 
 
 def main() -> None:
@@ -31,10 +65,15 @@ def main() -> None:
       default='encoded_data.txt')
   args = parser.parse_args()
   source_filename = args.source_data
-  train_data_filename = args.outfile
-  feature_extractor.process(source_filename, train_data_filename)
+  entries_filename = args.outfile
+  with open(source_filename, encoding=sys.getdefaultencoding()) as f:
+    data = f.readlines()
+  with open(entries_filename, 'w', encoding=sys.getdefaultencoding()) as f:
+    f.write('')
+  for line in data:
+    process(line, entries_filename)
   print('\033[92mEncoded training data is output to: %s\033[0m' %
-        (train_data_filename))
+        entries_filename)
 
 
 if __name__ == '__main__':

diff --git a/scripts/load_knbc.py b/scripts/load_knbc.py
@@ -21,7 +21,7 @@
 import urllib.request
 from html.parser import HTMLParser
 
-from context import utils
+from budoux import utils
 
 RESOURCE_URL = (
     'https://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2')

diff --git a/tests/test_encode_data.py b/tests/test_encode_data.py
@@ -0,0 +1,86 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests the data encoder script."""
+
+import os
+import sys
+import unittest
+from pathlib import Path
+
+from budoux import utils
+
+# module hack
+LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
+sys.path.insert(0, os.path.abspath(LIB_PATH))
+from scripts import encode_data  # type: ignore # noqa (module hack)
+
+ENTRIES_FILE_PATH = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), 'entries_test.txt'))
+
+
+class TestEncodeData(unittest.TestCase):
+
+  def setUp(self) -> None:
+    Path(ENTRIES_FILE_PATH).touch()
+
+  def test_process(self) -> None:
+    separated_sentence = f'これは{utils.SEP}美しい{utils.SEP}ペンです。'
+    encode_data.process(separated_sentence, ENTRIES_FILE_PATH)
+    with open(
+        ENTRIES_FILE_PATH, encoding=sys.getdefaultencoding(),
+        errors='replace') as f:
+      entries = f.read().splitlines()
+    original_sentence = ''.join(separated_sentence.split(utils.SEP))
+    self.assertEqual(
+        len(entries), len(original_sentence),
+        'Should start making entries from the first character.')
+
+    labels = [int(entry.split('\t')[0]) for entry in entries]
+    self.assertListEqual(
+        labels,
+        [
+            -1,  # こ
+            -1,  # れ
+            1,  # は
+            -1,  # 美
+            -1,  # し
+            1,  # い
+            -1,  # ペ
+            -1,  # ン
+            -1,  # で
+            -1,  # す
+            1  # 。
+        ],
+        'The first column of entries should be labels.')
+
+    features = [set(entry.split('\t')[1:]) for entry in entries]
+    self.assertIn(
+        'UW3:こ', features[0],
+        'The first feature set should include the first character as the UW3 feature.'
+    )
+    self.assertIn(
+        'UW3:れ', features[1],
+        'The second feature set should include the second character as the UW3 feature.'
+    )
+    self.assertIn(
+        'UW3:は', features[2],
+        'The third feature set should include the third character as the UW3 feature.'
+    )
+    self.assertIn(
+        'UW3:。', features[-1],
+        'The last feature set should include the last character as the UW3 feature.'
+    )
+
+  def tearDown(self) -> None:
+    os.remove(ENTRIES_FILE_PATH)
diff --git a/tests/test_feature_extractor.py b/tests/test_feature_extractor.py
@@ -13,39 +13,14 @@
 # limitations under the License.
 """Tests methods for the feature extractor."""
 
-import io
-import os
-import sys
 import typing
 import unittest
-from pathlib import Path
 
-# module hack
-LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
-sys.path.insert(0, os.path.abspath(LIB_PATH))
-
-from budoux import feature_extractor, utils  # noqa (module hack)
-
-if isinstance(sys.stdin, io.TextIOWrapper):
-  sys.stdin.reconfigure(encoding='utf-8')
-
-if isinstance(sys.stdout, io.TextIOWrapper):
-  sys.stdout.reconfigure(encoding='utf-8')
-
-SOURCE_FILE_PATH = os.path.abspath(
-    os.path.join(os.path.dirname(__file__), 'source_test.txt'))
-ENTRIES_FILE_PATH = os.path.abspath(
-    os.path.join(os.path.dirname(__file__), 'entries_test.txt'))
+from budoux import feature_extractor, utils
 
 
 class TestFeatureExtractor(unittest.TestCase):
 
-  def setUp(self) -> None:
-    Path(ENTRIES_FILE_PATH).touch()
-    self.test_entry = f'これは{utils.SEP}美しい{utils.SEP}ペンです。'
-    with open(SOURCE_FILE_PATH, 'w', encoding=sys.getdefaultencoding()) as f:
-      f.write(self.test_entry)
-
   def test_unicode_block_index(self) -> None:
 
     def check(character: str, block: str, msg: str) -> None:
@@ -146,57 +121,6 @@ def find_by_prefix(prefix: str, feature: typing.List[str]) -> bool:
         find_by_prefix('BB2:', feature),
         'Should omit the Unicode feature that covers an invalid character.')
 
-  def test_process(self) -> None:
-    feature_extractor.process(SOURCE_FILE_PATH, ENTRIES_FILE_PATH)
-    with open(
-        ENTRIES_FILE_PATH, encoding=sys.getdefaultencoding(),
-        errors='replace') as f:
-      entries = f.read().splitlines()
-    test_sentence = ''.join(self.test_entry.split(utils.SEP))
-    self.assertEqual(
-        len(entries), len(test_sentence),
-        'Should start making entries from the first character.')
-
-    labels = [int(entry.split('\t')[0]) for entry in entries]
-    self.assertListEqual(
-        labels,
-        [
-            -1,  # こ
-            -1,  # れ
-            1,  # は
-            -1,  # 美
-            -1,  # し
-            1,  # い
-            -1,  # ペ
-            -1,  # ン
-            -1,  # で
-            -1,  # す
-            1  # 。
-        ],
-        'The first column of entries should be labels.')
-
-    features = [set(entry.split('\t')[1:]) for entry in entries]
-    self.assertIn(
-        'UW3:こ', features[0],
-        'The first feature set should include the first character as the UW3 feature.'
-    )
-    self.assertIn(
-        'UW3:れ', features[1],
-        'The second feature set should include the second character as the UW3 feature.'
-    )
-    self.assertIn(
-        'UW3:は', features[2],
-        'The third feature set should include the third character as the UW3 feature.'
-    )
-    self.assertIn(
-        'UW3:。', features[-1],
-        'The last feature set should include the last character as the UW3 feature.'
-    )
-
-  def tearDown(self) -> None:
-    os.remove(SOURCE_FILE_PATH)
-    os.remove(ENTRIES_FILE_PATH)
-
 
 if __name__ == '__main__':
   unittest.main()