forked from sparklemotion/nokogiri
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_document_encoding.rb
143 lines (118 loc) · 4.76 KB
/
test_document_encoding.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
require "helper"
module Nokogiri
module HTML
# Tests for how Nokogiri::HTML reports a document's encoding and how
# serialization (+to_html+) honours it, using Shift_JIS / EUC-JP input and
# deliberately malformed charset declarations.
#
# The fixture constants (SHIFT_JIS_HTML, SHIFT_JIS_NO_CHARSET) are defined
# outside this file, presumably by the required test helper.
class TestDocumentEncoding < Nokogiri::TestCase
  # Serializing a document parsed from a binary stream keeps the document's
  # own encoding by default and transcodes when :encoding is given.
  def test_encoding
    # Block form of File.open closes the handle as soon as parsing is done;
    # the original left the file open for the rest of the test run.
    doc = File.open(SHIFT_JIS_HTML, 'rb') { |io| Nokogiri::HTML(io) }

    hello = "こんにちは"

    native_html = doc.to_html
    assert_match doc.encoding, native_html
    assert_match hello.encode('Shift_JIS'), native_html
    assert_equal 'Shift_JIS', native_html.encoding.name

    utf8_html = doc.to_html(:encoding => 'UTF-8')
    assert_match hello, utf8_html
    assert_match 'UTF-8', utf8_html
    # Exact comparison: assert_match would also accept any name that merely
    # contains "UTF-8".
    assert_equal 'UTF-8', utf8_html.encoding.name
  end

  # A string read as Shift_JIS is parsed correctly; the fixture name suggests
  # the markup carries no charset declaration (not verifiable from here).
  def test_encoding_without_charset
    # Read inside a block so the file handle is closed immediately.
    html = File.open(SHIFT_JIS_NO_CHARSET, 'r:Shift_JIS:Shift_JIS', &:read)
    doc = Nokogiri::HTML(html)

    hello = "こんにちは"

    assert_match hello, doc.content

    utf8_html = doc.to_html(:encoding => 'UTF-8')
    assert_match hello, utf8_html
    assert_equal 'UTF-8', utf8_html.encoding.name
  end

  # When the declared charset is malformed, the document encoding is expected
  # to equal the encoding of the Ruby string that was parsed.
  def test_default_to_encoding_from_string
    bad_charset = bad_charset_html

    doc = Nokogiri::HTML(bad_charset)
    assert_equal bad_charset.encoding.name, doc.encoding

    doc = Nokogiri.parse(bad_charset)
    assert_equal bad_charset.encoding.name, doc.encoding
  end

  # Text extracted from Shift_JIS and EUC-JP documents round-trips: after
  # re-encoding to the source encoding, the bytes match the original text.
  def test_encoding_non_utf8
    orig = '日本語が上手です'
    bin = Encoding::ASCII_8BIT

    [Encoding::Shift_JIS, Encoding::EUC_JP].each do |enc|
      html = <<-eohtml.encode(enc)
<html>
<meta http-equiv="Content-Type" content="text/html; charset=#{enc.name}">
<title xml:lang="ja">#{orig}</title></html>
      eohtml
      text = Nokogiri::HTML.parse(html).at('title').inner_text

      # Compare as raw bytes so the strings' encoding tags cannot influence
      # the equality check.
      assert_equal(
        orig.encode(enc).force_encoding(bin),
        text.encode(enc).force_encoding(bin)
      )
    end
  end

  # Passing an unrecognizable encoding name must still produce a document
  # that can be queried.
  def test_encoding_with_a_bad_name
    doc = Nokogiri::HTML(bad_charset_html, nil, 'askldjfhalsdfjhlkasdfjh')

    assert_equal ['http://tenderlovemaking.com/'],
                 doc.css('a').map { |a| a['href'] }
  end

  # Parsing nil with an explicit encoding yields a document reporting it.
  def test_empty_doc_encoding
    encoding = 'US-ASCII'

    assert_equal encoding, Nokogiri::HTML.parse(nil, nil, encoding).encoding
  end

  private

  # Returns a fresh HTML string whose meta tag carries a malformed charset
  # declaration ("charset=charset=UTF-8"). Shared by the tests that need
  # invalid encoding metadata, so the fixture is defined once.
  def bad_charset_html
    <<-eohtml
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=charset=UTF-8">
</head>
<body>
<a href="http://tenderlovemaking.com/">blah!</a>
</body>
</html>
    eohtml
  end
end
# Tests that Nokogiri::HTML produces the same result whether a document is
# supplied as a binary string or as a binary IO stream, with and without an
# explicit encoding argument.
#
# The fixture constants (NOENCODING_FILE, METACHARSET_FILE,
# ENCODING_XHTML_FILE, ENCODING_HTML_FILE) are defined outside this file,
# presumably by the required test helper.
class TestDocumentEncodingDetection < Nokogiri::TestCase
  # Returns the raw bytes of +file+ as a binary string.
  def binread(file)
    # File.binread only ever opens a path; IO.binread may treat a leading
    # "|" as a command to run.
    File.binread(file)
  end

  # Opens +file+ for binary reading.
  #
  # With a block, yields the open File, closes it afterwards and returns the
  # block's value. Without a block, returns the open File and the caller is
  # responsible for closing it (the original behaviour).
  def binopen(file, &block)
    File.open(file, 'rb', &block)
  end

  # Stream and string parsing of the same file serialize to the same length.
  def test_document_html_noencoding
    from_stream = binopen(NOENCODING_FILE) { |io| Nokogiri::HTML(io) }
    from_string = Nokogiri::HTML(binread(NOENCODING_FILE))

    assert_equal from_string.to_s.size, from_stream.to_s.size
  end

  # The document parsed from METACHARSET_FILE reports iso-2022-jp and its
  # title is decoded correctly.
  def test_document_html_charset
    html = binopen(METACHARSET_FILE) { |io| Nokogiri::HTML(io) }

    assert_equal 'iso-2022-jp', html.encoding
    assert_equal 'たこ焼き仮面', html.title
  end

  # For both the XHTML and the HTML fixture, all four input paths (string or
  # stream, with or without an explicit Shift_JIS encoding) must yield the
  # same title and the same paragraph texts.
  def test_document_xhtml_enc
    # Expected values do not depend on the fixture, so build them once.
    title = 'たこ焼き仮面'
    evil = (0..72).map { |i| '超' * i + '悪い事を構想中。' }

    [ENCODING_XHTML_FILE, ENCODING_HTML_FILE].each do |file|
      # Hash literals keep insertion order, so documents are parsed in the
      # order: string+enc, string, stream+enc, stream. Streams are opened in
      # block form so every handle is closed right after parsing.
      docs = {
        'string with explicit encoding' =>
          Nokogiri::HTML(binread(file), nil, 'Shift_JIS'),
        'string with detected encoding' =>
          Nokogiri::HTML(binread(file)),
        'stream with explicit encoding' =>
          binopen(file) { |io| Nokogiri::HTML(io, nil, 'Shift_JIS') },
        'stream with detected encoding' =>
          binopen(file) { |io| Nokogiri::HTML(io) }
      }

      docs.each do |source, doc|
        # The label identifies which fixture and input path failed.
        label = "#{file} (#{source})"

        assert_equal(title, doc.at('//title/text()').text, label)
        assert_equal(evil, doc.xpath('//p/text()').map(&:text), label)
      end
    end
  end
end
end
end