forked from empenguinxh/Anki-CreateImportFile
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_zhuji.py
187 lines (184 loc) · 8.74 KB
/
convert_zhuji.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# coding:utf-8
import re
import json
import codecs
import functools
import os.path
from random import random
from random import randint
from pprint import pprint
from copy import deepcopy
from my_helpers import *
# Default input: the text export of the vocabulary book, kept under base_data.
# The path is assembled with os.path.join so the separator fits the running
# platform; the previous literal hard-coded a backslash (Windows only) and
# relied on "\G" not being a recognised escape sequence.
file_zhuji = os.path.join(
    "base_data",
    "GREHe Xin Ci Hui Zhu Ji Yu Jing - Cao Tian Cheng.txt")
# Matches a backslash that is directly followed by one of [ ] ( ) * +
# (only the backslash is consumed, the following character is a lookahead).
# main() substitutes these matches with '' to un-escape the text.
match_escape_char_re = re.compile(r'\\(?=[\[\]()*+])')
# Matches the heading text "### List <number>"; main() passes this pattern to
# extract_content_between() to cut the book text into one string per list.
match_zhuji_list_start_re = re.compile(ur'### List \d+', re.M)
def get_etyma_block_d_l_l(list_data_l):
    '''
    Cut every word list into its etyma blocks.

    list_data_l: iterable of strings, one per "List" section of the book.
    Only the entries with index 0..38 are processed.

    Returns a list with one item per processed list; each item is a list
    with one dict per etyma block, holding the keys
        'pos'           -- (list number, block number), both 1-based
        'ety'           -- the block heading, '' when there is none or it
                           is u'其他'
        'ety_block_str' -- block text before u'小结' with the heading lines
                           removed
        'summary'       -- text after u'小结' when the block splits into
                           exactly two parts on it, otherwise ''
    '''
    # a block starts either with "<number>.<heading>" or with "Unit <number>"
    block_start_re = re.compile(r'^\d+\.(.*)$\n|^Unit \d+$', re.M)
    result_l_l = []
    for list_index, list_text in enumerate(list_data_l):
        if list_index > 38:
            break
        # keep only what precedes the list level summary/review part
        list_text = list_text.split(u'小结&复习')[0]
        raw_block_l = extract_content_between(list_text, block_start_re)
        block_d_l = []
        for etyma_index, raw_block in enumerate(raw_block_l):
            # group(1) is None when the "Unit <number>" alternative matched
            heading = block_start_re.search(raw_block).group(1)
            ety_str = '' if heading is None else heading
            ety_str = ety_str.strip()
            if ety_str == u'其他':
                ety_str = ''
            # every block of list 36 gets a fixed prefix
            if list_index == 36-1:
                ety_str = u'与动物有关的单词,' + ety_str
            ety_str = ety_str.strip()

            part_l = raw_block.split(u'小结')
            if len(part_l) == 2:
                summary_str = part_l[1]
            else:
                summary_str = ''
            block_text = block_start_re.sub('', part_l[0])

            # manual correction for "surg, cit": line 5 moves to position 1
            if ety_str == 'surg, cit':
                line_l = block_text.split('\n')
                line_l.insert(1, line_l.pop(5))
                block_text = '\n'.join(line_l)
            # manual correction for the "rejoice" block: line 9 moves to
            # position 1; lines after index 9 are not kept
            if ety_str == u'欢乐与喜悦':
                line_l = block_text.split('\n')[:10]
                line_l.insert(1, line_l.pop(9))
                block_text = '\n'.join(line_l)

            block_d_l.append({
                'pos': (list_index+1, etyma_index+1),
                'ety': ety_str,
                'ety_block_str': block_text,
                'summary': summary_str,
            })
        result_l_l.append(block_d_l)
    return result_l_l
def revise_miss_etyma(base_d_l_l):
    '''
    Set the 'ety' value of three etyma blocks by hand.

    base_d_l_l is indexed as [list][etyma block] and is modified in place;
    nothing is returned.
    '''
    # (list number, etyma number, etyma text) -- both numbers are 1-based
    manual_ety_l = [
        (25, 3, 'tum'),
        (5, 4, 'post, pound'),
        (6, 7, 'vad, vag, ced'),
    ]
    for list_no, etyma_no, ety in manual_ety_l:
        base_d_l_l[list_no-1][etyma_no-1]['ety'] = ety
# First line of a cognate block, matched case-insensitively per line:
#   group 1 -- run of letters a-z, é, ï or '-' (read as the word by callers)
#   group 2 -- whatever lies between, matched lazily
#   group 3 -- a bracketed "[...]" part ending the line (read as 'phon')
match_cognate_block_start_re = re.compile(ur'^([a-zéï-]+)(.*?)(\[.*\])$', re.M|re.I)
def process_ety_block_str(base_d_l_l):
    '''
    Split the 'ety_block_str' of every etyma dict into cognate blocks.

    Each etyma dict in base_d_l_l gets two new keys, set in place:
        'etyma_group_explanation' -- first piece returned by
                                     extract_content_between(), stripped
        'cognate_block_str_l'     -- the remaining pieces
    The third argument (True) of extract_content_between() is defined in
    my_helpers; presumably it keeps the text before the first match as
    piece 0 -- TODO confirm.
    '''
    path_to_ety_block_str = [('all', '', True),
                             ('all', '', True),
                             ('key', 'ety_block_str', False)]
    for found in iter_through_general(base_d_l_l, path_to_ety_block_str):
        list_index, ety_index, ety_block_str = found
        etyma_block_d = base_d_l_l[list_index][ety_index]
        piece_l = extract_content_between(ety_block_str,
                                          match_cognate_block_start_re, True)
        group_explanation = piece_l.pop(0).strip()
        etyma_block_d['etyma_group_explanation'] = group_explanation
        etyma_block_d['cognate_block_str_l'] = piece_l
# revise List 13, ety 3 revise scru
def revise_scru(base_d_l_l):
    '''
    Move the "scru" words of list 13, etyma block 3 into a group of their own.

    The first three cognate blocks stay in the original etyma dict; the
    blocks from index 3 on are moved into a new etyma dict ('ety' == 'scru')
    that is appended to list 13. The sentence introducing the scru words is
    removed from the third remaining block. base_d_l_l is modified in place.

    please only call it one time
    or re-run the code cells starting from
    "zhuji_base_d_l_l = get_etyma_block_d_l_l(zhuji_base_list_l)"
    '''
    to_revise_l = base_d_l_l[13-1][3-1]['cognate_block_str_l']
    # blocks from index 3 on belong to the new group
    new_l = to_revise_l[3:]
    to_revise_l[2] = to_revise_l[2].replace(u'以下的4个单词可以将scru按照读音联想成“四顾”,表示“ (顾虑地) 看”。', '')
    # Truncate the stored list itself. Rebinding the local name to a slice
    # (as was done before) left the moved blocks in the original group too,
    # so those words were listed under both groups.
    del to_revise_l[3:]
    new_ety = 'scru'
    new_ety_group_exp = u'将scru按照读音联想成“四顾”,表示“ (顾虑地) 看”'
    new_ety_d = {'cognate_block_str_l': new_l, 'pos': (13, 3),
                 'ety': new_ety,
                 'etyma_group_explanation': new_ety_group_exp,
                 'summary': '', 'ety_block_str': ''}
    base_d_l_l[13-1].append(new_ety_d)
def process_cognate_block(cognate_block_str):
cognate_dict = {}
cognate_lines_l = cognate_block_str.split('\n')
first_line_match = match_cognate_block_start_re.match(cognate_lines_l.pop(0))
word = first_line_match.group(1)
if (word == '') or (word is None):
print 'Warning!'
cognate_dict['word'] = word
phon = first_line_match.group(3)
cognate_dict['phon'] = phon if not (phon is None) else ''
modified_cognate_lines_l = []
for cognate_line in cognate_lines_l:
cognate_line = cognate_line.strip()
if cognate_line == '':
pass
elif cognate_line.startswith(u'源'):
# revise 源
cognate_line = cognate_line.replace(u'源', u'[源]')
# print cognate_line
elif cognate_dict['word'] == u'facilitate':
pass
elif cognate_dict['word'] in ['jocular', 'jocund', 'jovial', 'rejoice']:
pass
elif cognate_line.startswith(u'以下两个单词中'):
pass
elif not cognate_line.startswith(u'['):
# test
print 'current line:', cognate_line, '\ncurrent block\n', cognate_block_str
break
else:
pass
modified_cognate_lines_l.append(cognate_line)
cognate_dict['content'] = '\n'.join(modified_cognate_lines_l)
return cognate_dict
def process_all_cognate_block(base_data_d_l_l):
base_word_d = {}
path_to_cognate_block_str = [('all','',True),('all','',True),
('key','cognate_block_str_l',False),('all','',True)]
for list_index, eytma_index, cognate_index, cognate_block_str in iter_through_general(base_data_d_l_l,
path_to_cognate_block_str):
one_word_d = process_cognate_block(cognate_block_str)
word = one_word_d['word']
for _key in ['pos', 'ety', 'etyma_group_explanation', 'summary']:
one_word_d[_key] = base_data_d_l_l[list_index][eytma_index][_key]
one_word_d['pos'] = ', '.join([unicode(i) for i in one_word_d['pos']])
one_word_d['etyma_cognates_l'] = '' # waiting to be filled later
if word in base_word_d:
print 'Warning! word already exists!', word
base_word_d[word] = one_word_d
return base_word_d
def add_etyma_cognates_l(base_word_d, base_d_l_l):
path_to_etyma_d = [('all','',False),('all','',False)]
for etyma_d, in iter_through_general(base_d_l_l, path_to_etyma_d):
ety_str = etyma_d['ety']
ety_group_exp = etyma_d['etyma_group_explanation']
if ety_str != '' or ety_group_exp != '':
if ety_str == '':
# test
print ety_group_exp
etyma_cognates_l = []
for cognate_block_str in etyma_d['cognate_block_str_l']:
word = match_cognate_block_start_re.match(cognate_block_str).group(1)
etyma_cognates_l.append(word)
for word in etyma_cognates_l:
base_word_d[word]['etyma_cognates_l'] = ', '.join(etyma_cognates_l)
def main(file_name=None):
    '''
    Convert the book text into the word dict and dump it as JSON.

    file_name: path of the text file to read; file_zhuji when None.
    Returns None without doing anything when the file does not exist.

    Writes two files to the current directory:
        temp_zhuji_base_str.txt -- the text after un-escaping and
                                   collapse_blank_line()
        zhuji_base_d.txt        -- JSON dump of the word dict
    '''
    if file_name is None:
        file_name = file_zhuji
    # for module call
    if not os.path.isfile(file_name):
        return
    # Read the file that was just checked. The global default used to be
    # read here, so a path passed by the caller was silently ignored.
    zhuji_base_str = codecs_open_r_utf8(file_name)
    zhuji_base_str = match_escape_char_re.sub('', zhuji_base_str)
    zhuji_base_str = collapse_blank_line(zhuji_base_str)
    with codecs.open('temp_zhuji_base_str.txt', 'w', encoding='utf-8') as f:
        f.write(zhuji_base_str)
    # only the text before this heading is processed
    zhuji_base_str = zhuji_base_str.split(u'# 第二篇 核心词汇练习')[0]
    zhuji_base_list_l = extract_content_between(zhuji_base_str, match_zhuji_list_start_re)
    zhuji_base_d_l_l = get_etyma_block_d_l_l(zhuji_base_list_l)
    revise_miss_etyma(zhuji_base_d_l_l)
    process_ety_block_str(zhuji_base_d_l_l)
    revise_scru(zhuji_base_d_l_l)
    zhuji_base_word_d = process_all_cognate_block(zhuji_base_d_l_l)
    add_etyma_cognates_l(zhuji_base_word_d, zhuji_base_d_l_l)
    with codecs.open('zhuji_base_d.txt', 'w', encoding='utf-8') as f:
        json.dump(zhuji_base_word_d, f)


if __name__ == '__main__':
    main()