"""This module includes Normalizer class for normalizing texts"""
from __future__ import annotations
from typing import List
from .char_config import CharConfig
from .mappings import MappingDict
from .nltk_tokenizer import NltkTokenizer, Tokenizer
# pylint: disable=too-few-public-methods
class Normalizer:
    """
    A text normalizer.

    ...

    Attributes
    ----------
    configs : List[str]
        list of the desired mapping configs
    remove_extra_spaces : bool
        whether consecutive spaces are collapsed into one
    tokenization : bool
        whether the text is tokenized before normalization

    Methods
    -------
    normalize(text: str):
        normalize the given text and return the result
    """
    def __init__(self, configs: Optional[List[str]] = None,
                 remove_extra_spaces: bool = True, tokenization: bool = True,
                 tokenizer: Optional[Tokenizer] = None):
        """
        Constructor.

        :param configs: list of the desired mapping configs
        :param remove_extra_spaces: whether consecutive spaces are collapsed into one
        :param tokenization: whether the text is tokenized before normalization
        :param tokenizer: a custom tokenizer; defaults to NltkTokenizer
        """
        if configs is None:
            configs = []
        self.__configs = configs
        self.__remove_extra_spaces = remove_extra_spaces
        self.__mapping = MappingDict.load_jsons(self.__configs)
        if tokenization:
            # Use the provided tokenizer, or fall back to the NLTK-based one.
            self.__tokenizer = tokenizer if tokenizer else NltkTokenizer()
        else:
            self.__tokenizer = None
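
    # Illustrative construction sketch (not part of the original source): a
    # caller may inject any object implementing the Tokenizer interface, e.g.
    # a hypothetical MyTokenizer:
    #
    #     normalizer = Normalizer(tokenization=True, tokenizer=MyTokenizer())
    #
    # With tokenization=False, every character is treated as a token.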
    # pylint: disable=too-many-branches
    def normalize(self, text: str) -> str:
        """
        Return a normalized version of the input text.

        :param text: the input text
        :return: the normalized text
        """
        if self.__tokenizer:
            is_token_list = self.__tokenize(text)
        else:
            is_token_list = [True] * len(text)
        result = ""
        last = None
        for i, char in enumerate(text):
            is_token = is_token_list[i]
            mapping_char = self.__mapping.get(char)
            if not self.__remove_extra_spaces:
                # Apply the mapping directly; token-only mappings are applied
                # only when the character is a standalone token.
                if mapping_char and \
                        (not mapping_char.is_token or (mapping_char.is_token and is_token)):
                    char = mapping_char.char
                result += char
            else:
                current = mapping_char if mapping_char else CharConfig(char)
                if current.is_space:
                    # Buffer the space; within a run of spaces only the one
                    # with the lowest space_priority is kept.
                    if last is None:
                        last = current
                    elif not last.is_space and last.space_after is not False:
                        last = current
                    elif last.is_space and current.space_priority < last.space_priority:
                        last = current
                else:
                    # Flush the buffered space before a non-space character.
                    if last and last.is_space and last.space_before is not False:
                        result += last.char
                    # If the last char is not a space and a space is needed
                    # before the current char or after the last one
                    if last and last.is_space is not True and \
                            (current.space_before or last.space_after) and is_token:
                        result += " "
                    if not current.is_token or (current.is_token and is_token):
                        result += current.char
                    else:
                        result += char
                    last = current
        if last and last.is_space:
            result += last.char
        return result
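
    # Worked sketch of the space-collapsing pass above (illustrative): for an
    # input like "a", space, space, "b", the first space is buffered in `last`
    # instead of being written; the second space only replaces the buffer if
    # it has a lower space_priority; the buffered space is finally flushed
    # when the non-space character "b" is reached (or after the loop, if the
    # text ends with spaces).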
    def __tokenize(self, text: str) -> List[bool]:
        """
        Return a list of booleans specifying, for each character, whether it
        forms a single-character token on its own.

        :param text: the input text
        :return: a list of booleans, one per character
        """
        is_token_list = [False] * len(text)
        spans = self.__tokenizer.span_tokenize(text)
        for (start, end) in spans:
            # Only single-character tokens are marked.
            if start + 1 == end:
                is_token_list[start] = True
        return is_token_list
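

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original module): with no
    # configs the mapping is empty, so only the space handling is exercised.
    # The sample string is arbitrary; NLTK tokenizer data must be available.
    demo_normalizer = Normalizer(remove_extra_spaces=True)
    print(demo_normalizer.normalize("hello   world  "))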