-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape_test.py
314 lines (284 loc) · 15 KB
/
scrape_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
import pytest
from bs4 import BeautifulSoup, Tag
from scrape import extract_center_info
def create_tags(name: str, details_html: str) -> tuple[Tag, Tag]:
    """Build the (name tag, details tag) pair that the tests feed to the scraper.

    ``name`` is wrapped in a ``<p class="entryName">`` element, while
    ``details_html`` is used verbatim. Each piece of markup is parsed with
    ``html.parser`` and the first ``<p>`` element of the result is returned.

    Raises:
        AssertionError: if either piece of markup contains no ``<p>`` element.
    """
    name_tag = BeautifulSoup(f'<p class="entryName">{name}</p>', "html.parser").p
    assert name_tag is not None

    details_tag = BeautifulSoup(details_html, "html.parser").p
    assert details_tag is not None

    return name_tag, details_tag
def test_basic() -> None:
    """A plain entry yields one trimmed value per labelled field.

    The fixture name carries a leading space and several values carry trailing
    spaces; the expected result has them stripped. The ``Find on`` map link is
    expected to be absent from the result.
    """
    name_tag, details_tag = create_tags(
        " Accidental Buddhist Sangha",
        """<p class="entryDetail">
<strong>Address:</strong> IL <br>
<strong>Tradition:</strong> Mahayana, Zen Buddhist Master Thich Nhat Hahn<br>
<strong>Affiliation:</strong> Community of Mindful Living/Order of Interbeing <br>
<strong>Phone:</strong> (630) 375-0881<br>
<strong>E-mail:</strong> <a href="mailto:jackhat1@aol.com">jackhat1@aol.com</a><br>
<strong>Find on:</strong> <a href="http://mapof.it/ Illinois" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
<strong>Contact:</strong> Jack Hatfield <br>
</p>
""",
    )

    actual = extract_center_info(name_tag, details_tag, page_number=1)

    expected = {
        "name": "Accidental Buddhist Sangha",
        "page": 1,
        "Address": "IL",
        "Tradition": "Mahayana, Zen Buddhist Master Thich Nhat Hahn",
        "Affiliation": "Community of Mindful Living/Order of Interbeing",
        "Phone": "(630) 375-0881",
        "E-mail": "jackhat1@aol.com",
        "Contact": "Jack Hatfield",
    }
    assert actual == expected
def test_html_elements_in_only_part_of_the_value() -> None:
    """A value that mixes bare text with an inline element keeps both parts.

    ``Main Contact`` here is plain text followed by an ``<i>`` element; the
    expected value is the two pieces joined by a single space.
    """
    # `Address` is left out of this fixture; it is covered by other tests.
    name_tag, details_tag = create_tags(
        "Alaska Buddhist Center - Rimay Tenzin Ling",
        """<p class="entryDetail">
<strong>Tradition:</strong> Vajrayana, Tibetan,Gelugpa<br>
<strong>Phone:</strong> (907) 374-3200<br>
<strong>E-mail:</strong> <a href="mailto:alaskabuddhistcenter@gmail.com">alaskabuddhistcenter@gmail.com</a><br>
<strong>Website:</strong> <a href="http://www.alaskabuddhistcenter.org/">http://www.alaskabuddhistcenter.org/</a><br>
<strong>Find on:</strong> <a href="http://mapof.it/Physical: 4448 Pikes Landing Road (UUFF building) Fairbanks, Alaska Mailing: P.O. Box 60062 Fairbanks 99706 Alaska" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
<strong>Main Contact:</strong> nevillejacobs@gmail.com <i>(Phone: 907.456.4780)</i><br>
</p>""",
    )

    actual = extract_center_info(name_tag, details_tag, page_number=1)

    expected = {
        "name": "Alaska Buddhist Center - Rimay Tenzin Ling",
        "page": 1,
        "Tradition": "Vajrayana, Tibetan,Gelugpa",
        "Phone": "(907) 374-3200",
        "E-mail": "alaskabuddhistcenter@gmail.com",
        "Website": "http://www.alaskabuddhistcenter.org/",
        "Main Contact": "nevillejacobs@gmail.com (Phone: 907.456.4780)",
    }
    assert actual == expected
def test_value_in_key_strong_element() -> None:
    """Part of the value may sit inside the ``<strong>`` label element.

    The label reads ``Contact: Vice-secretary General:``; the expected key is
    ``Contact`` and the remainder of the label is prepended to the value.
    """
    # The address is omitted here because other centers already exercise it.
    name_tag, details_tag = create_tags(
        "American Young Buddhist Association",
        """<p class="entryDetail">
<strong>Tradition:</strong> Mahayana, Humanistic Buddhism<br>
<strong>Find on:</strong> <a href="http://mapof.it/3456 Glenmark Drive Hacienda Heights 91745 California" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
<strong>Contact: Vice-secretary General:</strong> Ven. Hui-Chuang <br>
</p>""",
    )

    actual = extract_center_info(name_tag, details_tag, page_number=1)

    expected = {
        "name": "American Young Buddhist Association",
        "page": 1,
        "Tradition": "Mahayana, Humanistic Buddhism",
        "Contact": "Vice-secretary General: Ven. Hui-Chuang",
    }
    assert actual == expected
def test_two_entries_for_key() -> None:
    """A label that appears twice produces one key with both values.

    ``Community Dharma Leader`` occurs on two lines; the expected value lists
    both names separated by ``", "``.
    """
    # The Notes section is dropped from this fixture; it has its own test.
    name_tag, details_tag = create_tags(
        "Albuquerque Vipassana Sangha",
        """<p class="entryDetail">
<strong>Address:</strong> Albuquerque NM 87196 <br>
<strong>Tradition:</strong> Theravada, Vipassana (Insight Meditation)<br>
<strong>Website:</strong> <a href="http://abqsangha.org">http://abqsangha.org</a><br>
<strong>Find on:</strong> <a href="http://mapof.it/ Albuquerque 87196 New Mexico" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
<strong>Community Dharma Leader:</strong> Kathryn Turnipseed <br>
<strong>Community Dharma Leader:</strong> Valerie Roth <br>
""",
    )

    actual = extract_center_info(name_tag, details_tag, page_number=1)

    expected = {
        "name": "Albuquerque Vipassana Sangha",
        "page": 1,
        "Address": "Albuquerque NM 87196",
        "Tradition": "Theravada, Vipassana (Insight Meditation)",
        "Website": "http://abqsangha.org",
        "Community Dharma Leader": "Kathryn Turnipseed, Valerie Roth",
    }
    assert actual == expected
def test_a_element_uses_href() -> None:
    """An ``<a>`` inside a value contributes its link target, not its text.

    In ``Main Contact`` the link text is ``Email`` while the ``href`` is a
    ``mailto:`` address; the expected value contains the bare address.
    """
    name_tag, details_tag = create_tags(
        "Amitabha Foundation",
        """<p class="entryDetail">
<strong>Address:</strong> 109 Irvington Road Rochester NY 14620<br>
<strong>Tradition:</strong> Vajrayana, Tibetan, Drikung Kagyu<br>
<strong>Affiliation:</strong> Ayang Rinpoche<br>
<strong>Phone:</strong> 585-261-7094<br>
<strong>E-mail:</strong> <a href="mailto:ny@amitabhafoundation.us">ny@amitabhafoundation.us</a><br>
<strong>Website:</strong> <a href="http://www.amitabhafoundation.us">http://www.amitabhafoundation.us</a><br>
<strong>Find on:</strong> <a href="http://mapof.it/109 Irvington Road Rochester 14620 New York" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
<strong>Main Contact:</strong> Becky <a href="mailto:ny@amitabhafoundation.us">Email</a> <i>(Phone: 585-261-7094)</i><br>
</p>""",
    )

    actual = extract_center_info(name_tag, details_tag, page_number=1)

    expected = {
        "name": "Amitabha Foundation",
        "page": 1,
        "Address": "109 Irvington Road, Rochester NY 14620",
        "Tradition": "Vajrayana, Tibetan, Drikung Kagyu",
        "Affiliation": "Ayang Rinpoche",
        "Phone": "585-261-7094",
        "E-mail": "ny@amitabhafoundation.us",
        "Website": "http://www.amitabhafoundation.us",
        "Main Contact": "Becky ny@amitabhafoundation.us (Phone: 585-261-7094)",
    }
    assert actual == expected
def test_space_in_key_name() -> None:
    """Whitespace between a label and its colon is not kept in the key.

    The label is written ``Venerable :``; the expected key is ``Venerable``.
    """
    name_tag, details_tag = create_tags(
        "Aung Mangalar Buddhist Temple",
        """<p class="entryDetail">
<strong>Tradition:</strong> Theravada, Burmese<br>
<strong>Venerable :</strong> Nagasena<br>
</p>""",
    )

    actual = extract_center_info(name_tag, details_tag, page_number=1)

    expected = {
        "name": "Aung Mangalar Buddhist Temple",
        "page": 1,
        "Tradition": "Theravada, Burmese",
        "Venerable": "Nagasena",
    }
    assert actual == expected
def test_notes_section() -> None:
    """The ``Notes and Events`` text is read from the markup after the details.

    Two fixtures are checked in turn: a single-line note held in the
    ``entryDesc`` paragraph, and a multi-paragraph note whose paragraphs are
    expected to be joined by a blank line.
    """
    # --- Case 1: one-line note ---
    # The duplicate Community Dharma Leader keys are dropped from this fixture;
    # they are covered by a separate test.
    short_name_tag, short_details_tag = create_tags(
        "Albuquerque Vipassana Sangha",
        """<p class="entryDetail">
<strong>Address:</strong> Albuquerque NM 87196 <br>
<strong>Tradition:</strong> Theravada, Vipassana (Insight Meditation)<br>
<strong>Website:</strong> <a href="http://abqsangha.org">http://abqsangha.org</a><br>
<strong>Find on:</strong> <a href="http://mapof.it/ Albuquerque 87196 New Mexico" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
<strong>Notes and Events:</strong></p><p class="entryDesc">Contact :PO Box 40722 Albuquerque NM 87196<br></p>
""",
    )

    short_actual = extract_center_info(
        short_name_tag, short_details_tag, page_number=1
    )

    short_expected = {
        "name": "Albuquerque Vipassana Sangha",
        "page": 1,
        "Address": "Albuquerque NM 87196",
        "Tradition": "Theravada, Vipassana (Insight Meditation)",
        "Website": "http://abqsangha.org",
        "Notes and Events": "Contact :PO Box 40722 Albuquerque NM 87196",
    }
    assert short_actual == short_expected

    # --- Case 2: note spread over several <p> elements ---
    long_name_tag, long_details_tag = create_tags(
        "Dzogchen Community, Colorado",
        """<p class="entryDetail">
<strong>Tradition:</strong> Non-Sectarian, Dzogchen in the Transmission of Namkhai Norbu Rinpoche<br>
<strong>Notes and Events:</strong></p>
<p class="entryDesc"><!--StartFragment--></p>
<p>The Dzogchen Community of Colorado is a group of practitioners dedicated to practicing the transmission given to us by Namkhai Norbu Rinpoche. The Dzogchen, or “Great Perfection” are open to anyone who is interested, regardless of culture or religious tradition.</p>
<p>In the words of our teacher, “The Dzogchen teachings are neither a philosophy, nor a religious doctrine, nor a cultural tradition. Understanding the message of the teachings means discovering one’s own true condition stripped of all the self-deceptions and falsifications that the mind creates. The very meaning of the Tibetan term Dzogchen, ‘Great Perfection,’ refers to the true primordial state of every individual and not to any transcendent reality.” — from Dzogchen, The Self Perfected State by Namkhai Norbu.</p>
<!--EndFragment-->
<p></p>
<p></p>
<hr>
""",
    )

    long_actual = extract_center_info(long_name_tag, long_details_tag, page_number=1)

    long_expected = {
        "name": "Dzogchen Community, Colorado",
        "page": 1,
        "Tradition": "Non-Sectarian, Dzogchen in the Transmission of Namkhai Norbu Rinpoche",
        "Notes and Events": (
            "The Dzogchen Community of Colorado is a group of practitioners dedicated to "
            "practicing the transmission given to us by Namkhai Norbu Rinpoche. The Dzogchen, "
            "or “Great Perfection” are open to anyone who is interested, regardless of culture or "
            "religious tradition.\n\n"
            "In the words of our teacher, “The Dzogchen teachings are neither a philosophy, nor a "
            "religious doctrine, nor a cultural tradition. Understanding the message of the "
            "teachings means discovering one’s own true condition stripped of all the "
            "self-deceptions and falsifications that the mind creates. The very meaning of the "
            "Tibetan term Dzogchen, ‘Great Perfection,’ refers to the true primordial state of "
            "every individual and not to any transcendent reality.” — from Dzogchen, The Self "
            "Perfected State by Namkhai Norbu."
        ),
    }
    assert long_actual == long_expected
def test_address_remove_extra_state() -> None:
    """A state code repeated after the ZIP code is dropped from the address.

    The fixture address ends in ``NY 10025 NY``; the expected address ends in
    ``NY 10025``.
    """
    name_tag, details_tag = create_tags(
        "96th Street Sangha",
        """<p class="entryDetail">
<strong>Address:</strong> 275 W. 96th Street, #4C New York, NY 10025 NY <br>
<strong>Tradition:</strong> Mahayana, Zen/Pureland<br>
<strong>Affiliation:</strong> Higashi Honganji<br>
<strong>Phone:</strong> (212) 749-1127<br>
<strong>E-mail:</strong> <a href="mailto:gyobun@aol.com">gyobun@aol.com</a><br>
<strong>Find on:</strong> <a href="http://mapof.it/275 W. 96th Street, #4C New York, NY 10025 New York" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
<strong>Main Contact:</strong> T. Davis <br>
<strong>Spiritual Director:</strong> Rev. Thulani Davis <br>
</p>""",
    )

    actual = extract_center_info(name_tag, details_tag, page_number=1)

    expected = {
        "name": "96th Street Sangha",
        "page": 1,
        "Address": "275 W. 96th Street, #4C New York, NY 10025",
        "Tradition": "Mahayana, Zen/Pureland",
        "Affiliation": "Higashi Honganji",
        "Phone": "(212) 749-1127",
        "E-mail": "gyobun@aol.com",
        "Main Contact": "T. Davis",
        "Spiritual Director": "Rev. Thulani Davis",
    }
    assert actual == expected
def test_address_remove_whitespace_in_between_street_and_state() -> None:
    """The street part and the city/state part are joined by ``", "``.

    Four fixtures are checked in turn; in each one the expected ``Address``
    has exactly one comma-plus-space between the street and the city.
    """
    # --- Case 1: full entry ---
    zen_name_tag, zen_details_tag = create_tags(
        "Albuquerque Zen Center",
        """<p class="entryDetail">
<strong>Address:</strong> 2300 Garfield SE Albuquerque NM 87106 <br>
<strong>Tradition:</strong> Mahayana, Rinzai Zen<br>
<strong>Phone:</strong> (505) 268-4877<br>
<strong>E-mail:</strong> <a href="mailto:officeazc@gmail.com">officeazc@gmail.com</a><br>
<strong>Website:</strong> <a href="http://www.azc.org">http://www.azc.org</a><br>
<strong>Find on:</strong> <a href="http://mapof.it/2300 Garfield SE Albuquerque 87106 New Mexico" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
<strong>Contact:</strong> Seiju Mammoser <br>
</p>""",
    )

    zen_actual = extract_center_info(zen_name_tag, zen_details_tag, page_number=1)

    zen_expected = {
        "name": "Albuquerque Zen Center",
        "page": 1,
        "Address": "2300 Garfield SE, Albuquerque NM 87106",
        "Tradition": "Mahayana, Rinzai Zen",
        "Phone": "(505) 268-4877",
        "E-mail": "officeazc@gmail.com",
        "Website": "http://www.azc.org",
        "Contact": "Seiju Mammoser",
    }
    assert zen_actual == zen_expected

    # --- Case 2: two-word city name ---
    # `Contact` is omitted from this fixture; another test handles it.
    youth_name_tag, youth_details_tag = create_tags(
        "American Young Buddhist Association",
        """<p class="entryDetail">
<strong>Address:</strong> 3456 Glenmark Drive Hacienda Heights CA 91745<br>
<strong>Tradition:</strong> Mahayana, Humanistic Buddhism<br>
<strong>Find on:</strong> <a href="http://mapof.it/3456 Glenmark Drive Hacienda Heights 91745 California" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
</p>""",
    )

    youth_actual = extract_center_info(
        youth_name_tag, youth_details_tag, page_number=1
    )

    youth_expected = {
        "name": "American Young Buddhist Association",
        "page": 1,
        "Address": "3456 Glenmark Drive, Hacienda Heights CA 91745",
        "Tradition": "Mahayana, Humanistic Buddhism",
    }
    assert youth_actual == youth_expected

    # --- Case 3: address already has a comma and is split over two lines ---
    quaker_name_tag, quaker_details_tag = create_tags(
        "All One Dharma-Quaker House ",
        """<p class="entryDetail">
<strong>Address:</strong> 1440 Harvard Street,
Santa Monica CA 90404<br>
</p>""",
    )

    quaker_actual = extract_center_info(
        quaker_name_tag, quaker_details_tag, page_number=1
    )

    quaker_expected = {
        "name": "All One Dharma-Quaker House",
        "page": 1,
        "Address": "1440 Harvard Street, Santa Monica CA 90404",
    }
    assert quaker_actual == quaker_expected
def test_address_remove_mailing_address() -> None:
    """Only the physical location is kept when a mailing address is also given.

    The fixture address has a ``Physical:`` part followed by a ``Mailing:``
    part; the expected address is the physical part without its prefix.
    """
    # `Main Contact` is left out of this fixture; it is covered by other tests.
    name_tag, details_tag = create_tags(
        "Alaska Buddhist Center - Rimay Tenzin Ling",
        """<p class="entryDetail">
<strong>Address:</strong> Physical: 4448 Pikes Landing Road (UUFF building)
Fairbanks, Alaska
Mailing: P.O. Box 60062
Fairbanks AK 99706<br>
<strong>Tradition:</strong> Vajrayana, Tibetan,Gelugpa<br>
<strong>Phone:</strong> (907) 374-3200<br>
<strong>E-mail:</strong> <a href="mailto:alaskabuddhistcenter@gmail.com">alaskabuddhistcenter@gmail.com</a><br>
<strong>Website:</strong> <a href="http://www.alaskabuddhistcenter.org/">http://www.alaskabuddhistcenter.org/</a><br>
<strong>Find on:</strong> <a href="http://mapof.it/Physical: 4448 Pikes Landing Road (UUFF building) Fairbanks, Alaska Mailing: P.O. Box 60062 Fairbanks 99706 Alaska" target="_blank"><img align="absmiddle" src="images/map.gif" border="0" style="margin-top:2px"></a><br>
</p>""",
    )

    actual = extract_center_info(name_tag, details_tag, page_number=1)

    expected = {
        "name": "Alaska Buddhist Center - Rimay Tenzin Ling",
        "page": 1,
        "Address": "4448 Pikes Landing Road (UUFF building), Fairbanks, Alaska",
        "Tradition": "Vajrayana, Tibetan,Gelugpa",
        "Phone": "(907) 374-3200",
        "E-mail": "alaskabuddhistcenter@gmail.com",
        "Website": "http://www.alaskabuddhistcenter.org/",
    }
    assert actual == expected