Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolves #111 carefully escape tag names #112

Merged
merged 1 commit into from Mar 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 24 additions & 4 deletions sanitize.go
Expand Up @@ -229,7 +229,7 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

case html.StartTagToken:

mostRecentlyStartedToken = strings.ToLower(token.Data)
mostRecentlyStartedToken = normaliseElementName(token.Data)

aps, ok := p.elsAndAttrs[token.Data]
if !ok {
Expand Down Expand Up @@ -272,7 +272,7 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

case html.EndTagToken:

if mostRecentlyStartedToken == strings.ToLower(token.Data) {
if mostRecentlyStartedToken == normaliseElementName(token.Data) {
mostRecentlyStartedToken = ""
}

Expand Down Expand Up @@ -350,11 +350,11 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

if !skipElementContent {
switch mostRecentlyStartedToken {
case "script":
case `script`:
// not encouraged, but if a policy allows JavaScript we
// should not HTML escape it as that would break the output
buff.WriteString(token.Data)
case "style":
case `style`:
// not encouraged, but if a policy allows CSS styles we
// should not HTML escape it as that would break the output
buff.WriteString(token.Data)
Expand Down Expand Up @@ -887,3 +887,23 @@ func (p *Policy) matchRegex(elementName string) (map[string]attrPolicy, bool) {
}
return aps, matched
}


// normaliseElementName takes a HTML element like <script> which is user input
// and returns a lower case version of it that is immune to UTF-8 to ASCII
// conversion tricks (like the use of upper case cyrillic i scrİpt which a
// strings.ToLower would convert to script). Instead this func will preserve
// all non-ASCII as their escaped equivalent, i.e. \u0130 which reveals the
// characters when lower cased
func normaliseElementName(str string) string {
// that useful QuoteToASCII put quote marks at the start and end
// so those are trimmed off
return strings.TrimSuffix(
strings.TrimPrefix(
strings.ToLower(
strconv.QuoteToASCII(str),
),
`"`),
`"`,
)
}
42 changes: 42 additions & 0 deletions sanitize_test.go
Expand Up @@ -1678,3 +1678,45 @@ func TestIssue85NoReferrer(t *testing.T) {
}
wg.Wait()
}



func TestIssue111ScriptTags(t *testing.T) {
p1 := NewPolicy()
p2 := UGCPolicy()
p3 := UGCPolicy().AllowElements("script")

in := `<scr\u0130pt>&lt;script&gt;alert(document.domain)&lt;/script&gt;`
expected := `&lt;script&gt;alert(document.domain)&lt;/script&gt;`
out := p1.Sanitize(in)
if out != expected {
t.Errorf(
"test failed;\ninput : %s\noutput : %s\nexpected: %s",
in,
out,
expected,
)
}

expected = `&lt;script&gt;alert(document.domain)&lt;/script&gt;`
out = p2.Sanitize(in)
if out != expected {
t.Errorf(
"test failed;\ninput : %s\noutput : %s\nexpected: %s",
in,
out,
expected,
)
}

expected = `&lt;script&gt;alert(document.domain)&lt;/script&gt;`
out = p3.Sanitize(in)
if out != expected {
t.Errorf(
"test failed;\ninput : %s\noutput : %s\nexpected: %s",
in,
out,
expected,
)
}
}