Skip to content

Commit

Permalink
Resolves #111 carefully escape tag names
Browse files Browse the repository at this point in the history
A regression of #56 occurred in which the use of ToLower allowed a
Cyrillic upper-case I to be sanitised to a standard ASCII i and
this would then permit SCRIPT tags to be injected.
  • Loading branch information
buro9 committed Mar 27, 2021
1 parent 3cce251 commit 524f142
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 4 deletions.
28 changes: 24 additions & 4 deletions sanitize.go
Expand Up @@ -229,7 +229,7 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

case html.StartTagToken:

mostRecentlyStartedToken = strings.ToLower(token.Data)
mostRecentlyStartedToken = normaliseElementName(token.Data)

aps, ok := p.elsAndAttrs[token.Data]
if !ok {
Expand Down Expand Up @@ -272,7 +272,7 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

case html.EndTagToken:

if mostRecentlyStartedToken == strings.ToLower(token.Data) {
if mostRecentlyStartedToken == normaliseElementName(token.Data) {
mostRecentlyStartedToken = ""
}

Expand Down Expand Up @@ -350,11 +350,11 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

if !skipElementContent {
switch mostRecentlyStartedToken {
case "script":
case `script`:
// not encouraged, but if a policy allows JavaScript we
// should not HTML escape it as that would break the output
buff.WriteString(token.Data)
case "style":
case `style`:
// not encouraged, but if a policy allows CSS styles we
// should not HTML escape it as that would break the output
buff.WriteString(token.Data)
Expand Down Expand Up @@ -887,3 +887,23 @@ func (p *Policy) matchRegex(elementName string) (map[string]attrPolicy, bool) {
}
return aps, matched
}


// normaliseElementName takes a HTML element like <script> which is user input
// and returns a lower case version of it that is immune to UTF-8 to ASCII
// conversion tricks (like the use of upper case cyrillic i scrİpt which a
// strings.ToLower would convert to script). Instead this func will preserve
// all non-ASCII as their escaped equivalent, i.e. \u0130 which reveals the
// characters when lower cased
func normaliseElementName(str string) string {
// that useful QuoteToASCII put quote marks at the start and end
// so those are trimmed off
return strings.TrimSuffix(
strings.TrimPrefix(
strings.ToLower(
strconv.QuoteToASCII(str),
),
`"`),
`"`,
)
}
42 changes: 42 additions & 0 deletions sanitize_test.go
Expand Up @@ -1678,3 +1678,45 @@ func TestIssue85NoReferrer(t *testing.T) {
}
wg.Wait()
}



func TestIssue111ScriptTags(t *testing.T) {
p1 := NewPolicy()
p2 := UGCPolicy()
p3 := UGCPolicy().AllowElements("script")

in := `<scr\u0130pt>&lt;script&gt;alert(document.domain)&lt;/script&gt;`
expected := `&lt;script&gt;alert(document.domain)&lt;/script&gt;`
out := p1.Sanitize(in)
if out != expected {
t.Errorf(
"test failed;\ninput : %s\noutput : %s\nexpected: %s",
in,
out,
expected,
)
}

expected = `&lt;script&gt;alert(document.domain)&lt;/script&gt;`
out = p2.Sanitize(in)
if out != expected {
t.Errorf(
"test failed;\ninput : %s\noutput : %s\nexpected: %s",
in,
out,
expected,
)
}

expected = `&lt;script&gt;alert(document.domain)&lt;/script&gt;`
out = p3.Sanitize(in)
if out != expected {
t.Errorf(
"test failed;\ninput : %s\noutput : %s\nexpected: %s",
in,
out,
expected,
)
}
}

0 comments on commit 524f142

Please sign in to comment.