-
Notifications
You must be signed in to change notification settings - Fork 12
/
main_generate_csv.py
141 lines (111 loc) · 4.11 KB
/
main_generate_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from rich import print
from rich.console import Console
from rich.progress import Progress
from rich.progress import track
from rich.table import Table
import pdfplumber
from spacy.lang.de import German
from spacy.lang.en import English
from spacy.lang.fr import French
from spacy.lang.it import Italian
import glob
import os
import re
import sys
from datetime import datetime, timedelta
# Typographic quotation marks that are normalised to a plain ASCII double
# quote when the cleansed variant of a sentence is produced.
replacement_map = dict.fromkeys(('«', '»', '“', '„'), '"')
def chunk_text(text, chunk_size=100000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    A piece that is not the last one is shortened so that it ends right
    after a whitespace character whenever the piece contains one; this keeps
    words from being cut in half at a chunk boundary. If a piece contains no
    whitespace at all it is cut at exactly *chunk_size* characters.
    Concatenating the returned pieces always yields *text* again.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per piece (must be >= 1).

    Returns:
        A list of strings; empty when *text* is empty.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            # Walk back to the character after the last whitespace.
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut
        chunks.append(text[start:end])
        start = end
    return chunks


def build_metadata_rows(sentences, replacements, min_length=5):
    """Create the ``wav|sentence|cleansed sentence`` rows for the metadata file.

    Newlines and tabs inside a sentence are replaced by a single space. The
    cleansed variant additionally has every key of *replacements* replaced
    by its value. The wav file name is derived from the position of the
    sentence in *sentences*, zero-padded to 12 characters, so skipped
    sentences leave gaps in the numbering.

    Args:
        sentences: Iterable of sentence strings.
        replacements: Mapping of substring -> replacement for the cleansed text.
        min_length: A sentence is only emitted when its cleansed form has
            more than this many characters.

    Returns:
        A list of row strings without trailing newline.
    """
    rows = []
    for index, sentence in enumerate(sentences):
        sentence = sentence.replace("\n", " ").replace("\t", " ")
        cleansed_sentence = sentence
        for character, replacement in replacements.items():
            cleansed_sentence = cleansed_sentence.replace(character, replacement)
        if len(cleansed_sentence) > min_length:
            wav_file_name = (str(index) + '.wav').rjust(12, '0')
            rows.append(wav_file_name + "|" + sentence + "|" + cleansed_sentence)
    return rows


def read_text_files(paths):
    """Return the UTF-8 contents of all *paths*, each followed by a newline."""
    parts = []
    for path in paths:
        # The context manager closes every handle, even if reading fails.
        with open(path, "r", encoding='utf-8') as text_file:
            parts.append(text_file.read() + '\n')
    return ''.join(parts)


def read_pdf_files(paths):
    """Return the extracted text of every page of all PDFs in *paths*.

    Pages for which no text could be extracted are skipped; every other
    page's text is followed by a newline.
    """
    parts = []
    for path in paths:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text + '\n')
    return ''.join(parts)


def create_sentence_splitter(language):
    """Return a spaCy pipeline with a sentencizer for *language*.

    Returns ``None`` when *language* is not one of ``de``, ``en``, ``fr``
    or ``it``.
    """
    pipelines = {'de': German, 'en': English, 'fr': French, 'it': Italian}
    factory = pipelines.get(language)
    if factory is None:
        return None
    nlp = factory()
    nlp.add_pipe('sentencizer')
    return nlp


def main():
    """Interactively collect texts, split them into sentences and write metadata.csv.

    Asks for a project folder, a language and the file types to read, loads
    the matching files from the ``texts`` folder next to this script, splits
    the text into sentences and appends one row per usable sentence to
    ``metadata.csv`` inside the project folder.
    """
    console = Console()
    table = Table()
    table.add_column("TSS CSV Generator", style="cyan")
    table.add_row("This tool extracts texts from the sample folder, splits them into sentence and creates a metadata.csv to be used with the generator.")
    table.add_row("2021 - padmalcom")
    table.add_row("www.stonedrum.de")
    console.print(table)

    app_folder = os.path.dirname(os.path.realpath(__file__))
    default_project_folder = os.path.join(
        app_folder, 'project_' + datetime.now().strftime("%d%m%Y_%H%M%S"))

    console.print("Please select a [red]project folder[/red] (default [i]%s[/i])." % default_project_folder)
    project_folder = input() or default_project_folder
    # makedirs also creates missing parent folders and tolerates an
    # already existing project folder.
    os.makedirs(project_folder, exist_ok=True)
    console.print("Project folder is %s" % project_folder)

    # Select language
    console.print("Please select a [red]language[/red] (default [i]%s[/i])." % 'de')
    in_lang = input() or 'de'
    console.print("Language is set to [red]%s[/red]." % in_lang)

    # Select file types to read; anything without 't' or 'p' means both.
    console.print("Please select the file types you want to load (t=text, p=pdf, tp=both) (default [i]tp[/i])")
    in_file_types = input()
    if 't' not in in_file_types and 'p' not in in_file_types:
        in_file_types = "tp"

    # Validate the language before any file is read so that an unsupported
    # language does not cost the time of parsing every PDF first.
    nlp = create_sentence_splitter(in_lang)
    if nlp is None:
        console.print("The language %s is not supported yet. Please create a github issue." % in_lang)
        sys.exit(0)

    # Collect the text independently per file type so that choosing only
    # one of them never leaves a variable undefined.
    texts_folder = os.path.join(app_folder, 'texts')
    text_parts = []
    if 't' in in_file_types:
        text_files = glob.glob(os.path.join(texts_folder, in_lang + '*.txt'))
        console.print("Found %d text files for %s" % (len(text_files), in_lang))
        text_parts.append(read_text_files(text_files))
    if 'p' in in_file_types:
        pdf_files = glob.glob(os.path.join(texts_folder, in_lang + '*.pdf'))
        console.print("Found %d pdf files for %s" % (len(pdf_files), in_lang))
        text_parts.append(read_pdf_files(pdf_files))
    all_texts = ''.join(text_parts)

    # Split texts by sentences, feeding the pipeline bounded chunks.
    all_sentences = []
    for chunk in chunk_text(all_texts):
        doc = nlp(chunk)
        all_sentences.extend(str(sent).strip() for sent in doc.sents)
    console.print("Found %d sentences." % len(all_sentences))

    # Write metadata.csv (append mode, as before).
    csv_file_name = 'metadata.csv'
    csv_file_path = os.path.join(project_folder, csv_file_name)
    rows = build_metadata_rows(all_sentences, replacement_map)
    with open(csv_file_path, 'a', encoding='utf-8') as csv_file:
        csv_file.writelines(row + '\n' for row in rows)

    # Report what was actually written; too-short sentences are not counted.
    duration = timedelta(seconds=len(rows) * 4)
    console.print("%d sentence written to %s. These are approximately %s of audio." % (len(rows), csv_file_name, duration))


if __name__ == '__main__':
    main()