# -*- coding: utf-8 -*-
# frozen_string_literal: true
require 'mechanize/test_case'
# Tests for Mechanize::Page charset extraction, encoding detection, and parsing.
class TestMechanizePageEncoding < Mechanize::TestCase
MECH_ASCII_ENCODING = 'US-ASCII'
# Per-test fixture: runs the parent setup, then stores the URI, response
# headers and HTML body that util_page uses when called without arguments.
def setup
  super

  @body = '<title>hi</title>'.dup # mutable copy; this file freezes string literals
  @response_headers = { 'content-type' => 'text/html' }
  @uri = URI('http://localhost/')
end
# Builds a Mechanize::Page for @uri with status code 200 and the @mech agent.
#
# body    - HTML source String, or nil for a page without a body
#           (defaults to @body).
# headers - response header Hash (defaults to @response_headers).
#
# The page is handed a binary-encoded *copy* of body (String#b) instead of
# re-tagging the argument in place, so neither the caller's string nor the
# shared @body fixture is mutated, and frozen strings are accepted.
def util_page(body = @body, headers = @response_headers)
  Mechanize::Page.new(@uri, headers, body&.b, 200, @mech)
end
# Page.charset must return the charset value with its letter case intact,
# also when a second media type follows a comma or when whitespace surrounds
# the ';' and '=' separators.
def test_page_charset
  {
    'text/html;charset=vAlue' => 'vAlue',
    'text/html;charset=vaLue, text/html' => 'vaLue',
    'text/html ; charset = valUe, text/html' => 'valUe'
  }.each do |content_type, expected|
    assert_equal expected, Mechanize::Page.charset(content_type)
  end
end
# An upper-case media type and CHARSET parameter name must still yield the
# charset value.
def test_page_charset_upcase
  assert_equal('UTF-8', Mechanize::Page.charset('TEXT/HTML;CHARSET=UTF-8'))
end
# A trailing ';' after the charset parameter must not end up in the value.
def test_page_charset_semicolon
  assert_equal('UTF-8', Mechanize::Page.charset('text/html;charset=UTF-8;'))
end
# A content type without any charset parameter yields nil.
# NOTE(review): the method name spells "chaset"; it is left unchanged so that
# existing test-name filters keep matching.
def test_page_charset_no_chaset_token
  assert_nil(Mechanize::Page.charset('text/html'))
end
# The literal charset value "none" is reported as nil rather than as a name.
def test_page_charset_returns_nil_when_charset_says_none
  assert_nil(Mechanize::Page.charset('text/html;charset=none'))
end
# When the charset parameter appears twice, the first occurrence wins.
def test_page_charset_multiple
  assert_equal('111', Mechanize::Page.charset('text/html;charset=111;charset=222'))
end
# Page.response_header_charset returns the content-type header's charset
# wrapped in an array.
def test_page_response_header_charset
  response = { 'content-type' => 'text/html;charset=HEADER' }

  assert_equal(['HEADER'], Mechanize::Page.response_header_charset(response))
end
# No charsets are reported when the content type carries no charset parameter,
# nor when the headers hold only an unrelated header.
def test_page_response_header_charset_no_token
  [
    { 'content-type' => 'text/html' },
    { 'X-My-Header' => 'hello' }
  ].each do |headers|
    assert_equal [], Mechanize::Page.response_header_charset(headers)
  end
end
# A charset carried by a header other than content-type (here x-content-type)
# is not reported.
def test_page_response_header_charset_wrong_header
  lookalike = { 'x-content-type' => 'text/html;charset=bogus' }

  assert_equal([], Mechanize::Page.response_header_charset(lookalike))
end
# Instance-level counterpart: a body-less page built with a charset-bearing
# content-type header reports that charset.
def test_response_header_charset
  header_only_page = util_page(nil, { 'content-type' => 'text/html;charset=HEADER' })

  assert_equal(['HEADER'], header_only_page.response_header_charset)
end
# Page.meta_charset extracts the charset from a <meta http-equiv="content-type">
# tag and returns it in an array.
def test_page_meta_charset
  html = '<meta http-equiv="content-type" content="text/html;charset=META">'

  assert_equal(['META'], Mechanize::Page.meta_charset(html))
end
# A <meta http-equiv="refresh"> tag is not a charset declaration, so no
# charsets are reported for it.
def test_page_meta_charset_is_empty_when_no_charset_meta
  refresh_only = '<meta http-equiv="refresh" content="5; url=index.html">'

  assert_equal([], Mechanize::Page.meta_charset(refresh_only))
end
# A content-type <meta> tag that lacks a content attribute produces an empty
# charset list.
def test_page_meta_charset_no_content
  contentless = '<meta http-equiv="content-type">'

  assert_empty(Mechanize::Page.meta_charset(contentless))
end
# Regression test for https://github.com/sparklemotion/mechanize/issues/143:
# spaces around the '=' of the <meta> tag's attributes must not prevent the
# charset from being found.
def test_page_meta_charset_handles_whitespace
  spaced_meta = '<meta http-equiv = "Content-Type" content = "text/html; charset=iso-8859-1">'

  assert_equal(["iso-8859-1"], Mechanize::Page.meta_charset(spaced_meta))
end
# Instance-level counterpart: a page built from a body holding a content-type
# <meta> tag reports that tag's charset.
def test_meta_charset
  # util_page gets a mutable copy because this file freezes string literals.
  page = util_page('<meta http-equiv="content-type" content="text/html;charset=META">'.dup)

  assert_equal(['META'], page.meta_charset)
end
# The default fixture page is expected to report MECH_ASCII_ENCODING as its
# detected encoding.
def test_detected_encoding
  assert_equal(MECH_ASCII_ENCODING, util_page.detected_encoding)
end
# Page#encodings must offer every candidate source at once: the response
# header charset, the <meta> charset, the detected encoding
# (MECH_ASCII_ENCODING) and the agent's default encoding.
#
# Uses assert_includes rather than `assert_equal true, ...include?` so that a
# failure prints the actual candidate list instead of "Expected: true".
def test_encodings
  response = { 'content-type' => 'text/html;charset=HEADER' }
  body = +'<meta http-equiv="content-type" content="text/html;charset=META">'
  @mech.default_encoding = 'DEFAULT'

  page = util_page body, response

  ['HEADER', 'META', MECH_ASCII_ENCODING, 'DEFAULT'].each do |candidate|
    assert_includes page.encodings, candidate
  end
end
# Setting the agent's default_encoding adds that encoding to the candidates of
# pages created afterwards.
#
# Uses refute_includes / assert_includes rather than comparing include? with
# a boolean, so that a failure prints the actual candidate list.
def test_parser_with_default_encoding
  # Precondition: without a configured default the encoding is not offered.
  refute_includes util_page.encodings, 'Windows-1252'

  @mech.default_encoding = 'Windows-1252'

  assert_includes util_page.encodings, 'Windows-1252'
end
# With force_default_encoding enabled, the configured default encoding is
# among the page's encoding candidates.
#
# Uses assert_includes rather than `assert ...include?` so that a failure
# prints the actual candidate list.
def test_parser_force_default_encoding
  @mech.default_encoding = 'Windows-1252'
  @mech.force_default_encoding = true

  assert_includes util_page.encodings, 'Windows-1252'
end
# Even with force_default_encoding enabled, an explicit Page#encoding=
# assignment replaces the forced default.
def test_parser_encoding_equals_overwrites_force_default_encoding
  @mech.default_encoding = 'Windows-1252'
  @mech.force_default_encoding = true
  forced_page = util_page

  # The forced default applies first ...
  assert_equal('Windows-1252', forced_page.encoding)

  # ... and an explicit assignment then takes its place.
  forced_page.encoding = 'ISO-8859-2'
  assert_equal('ISO-8859-2', forced_page.encoding)
end
# Text obtained by searching a page served with an ISO-8859-1 content-type
# header must be UTF-8 encoded.
#
# The charset parameter is separated from the media type with ';' (media-type
# parameter syntax). The header previously used ',', which presumably kept
# the ISO-8859-1 declaration from being recognised as a parameter at all.
def test_parser_encoding_when_searching_elements
  skip "Encoding not implemented" unless have_encoding?

  body = +'<span id="latin1">hi</span>'
  page = util_page(body, { 'content-type' => 'text/html;charset=ISO-8859-1' })

  assert_equal Encoding::UTF_8, page.search('#latin1').text.encoding
end
# Regression test for parse-error messages that are not valid UTF-8.
#
# Parses an HTML fragment whose comment holds multibyte (Japanese) text and
# checks that searching the page does not raise. If the parser recorded a
# "Comment not terminated" error, the test also confirms its own premise:
# matching a regex against that raw message raises ArgumentError mentioning
# "invalid byte sequence in UTF-8".
def test_parser_error_message_containing_encoding_errors
skip if RUBY_ENGINE == 'jruby' # this is a libxml2-specific condition
# https://github.com/sparklemotion/mechanize/issues/553
# NOTE(review): the `wzxhzdk:25` line in the fixture below looks like a
# placeholder left behind by a text-extraction tool rather than original
# fixture content -- confirm against the upstream file before relying on it.
body = +<<~EOF
<html>
<body>
<!--
## メモ
処理の一般化, 二重ループ, 多重ループ
wzxhzdk:25
-->
EOF
page = util_page body
# this should not raise an "invalid byte sequence in UTF-8" error while processing parsing errors
page.search("body")
# let's assert on the setup: a libxml2-returned parsing error itself contains an invalid character
# note that this problem only appears in libxml <= 2.9.10
error = page.parser.errors.find { |e| e.message.include?("Comment not terminated") }
# The premise check is conditional: it only runs when the parser actually
# reported the error looked up above.
if error
exception = assert_raises(ArgumentError) do
error.message =~ /any regex just to trigger encoding error/
end
assert_includes(exception.message, "invalid byte sequence in UTF-8")
end
end
end