Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor chm_htmlescape() #5862

Merged
merged 1 commit into from
Dec 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Bugs fixed
* #5834: apidoc: wrong help for ``--tocfile``
* #5800: todo: crashed if todo is defined in TextElement
* #5846: htmlhelp: convert hex escaping to decimal escaping in .hhc/.hhk files
* htmlhelp: broken .hhk file generated when title contains a double quote

Testing
--------
Expand Down
34 changes: 16 additions & 18 deletions sphinx/builders/htmlhelp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@

import codecs
import os
import re
from os import path

from docutils import nodes
from six import PY3

from sphinx import addnodes
from sphinx.builders.html import StandaloneHTMLBuilder
Expand Down Expand Up @@ -170,22 +170,21 @@
}


def chm_htmlescape(*args, **kwargs):
# type: (*Any, **Any) -> unicode
def chm_htmlescape(s, quote=None):
# type: (unicode, bool) -> unicode
"""
chm_htmlescape() is a wrapper of htmlescape().
chm_htmlescape() is a wrapper of html.escape().
.hhc/.hhk files don't recognize hex escaping, we need convert
hex escaping to decimal escaping. for example: `&#x27;` -> `&#39;`
htmlescape() may generates a hex escaping `&#x27;` for single
quote `'`, this wrapper fixes this.
hex escaping to decimal escaping. for example: ``&#x27;`` -> ``&#39;``
html.escape() may generates a hex escaping ``&#x27;`` for single
quote ``'``, this wrapper fixes this.
"""
def convert(matchobj):
# type: (Match[unicode]) -> unicode
codepoint = int(matchobj.group(1), 16)
return '&#%d;' % codepoint
return re.sub(r'&#[xX]([0-9a-fA-F]+);',
convert,
htmlescape(*args, **kwargs))
if quote is None:
quote = PY3 # True for py3, False for py2 (for compatibility)

s = htmlescape(s, quote)
s = s.replace('&#x27;', '&#39;') # re-escape as decimal
return s


class HTMLHelpBuilder(StandaloneHTMLBuilder):
Expand Down Expand Up @@ -297,7 +296,7 @@ def write_toc(node, ullevel=0):
write_toc(subnode, ullevel)
elif isinstance(node, nodes.reference):
link = node['refuri']
title = chm_htmlescape(node.astext()).replace('"', '&quot;')
title = chm_htmlescape(node.astext(), True)
f.write(object_sitemap % (title, link))
elif isinstance(node, nodes.bullet_list):
if ullevel != 0:
Expand Down Expand Up @@ -327,10 +326,9 @@ def write_index(title, refs, subitems):
# type: (unicode, List[Tuple[unicode, unicode]], List[Tuple[unicode, List[Tuple[unicode, unicode]]]]) -> None # NOQA
def write_param(name, value):
# type: (unicode, unicode) -> None
item = ' <param name="%s" value="%s">\n' % \
(name, value)
item = ' <param name="%s" value="%s">\n' % (name, value)
f.write(item)
title = chm_htmlescape(title)
title = chm_htmlescape(title, True)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I add 2nd argument to fix the bug. see CHANGES.

f.write('<LI> <OBJECT type="text/sitemap">\n')
write_param('Keyword', title)
if len(refs) == 0:
Expand Down
30 changes: 29 additions & 1 deletion tests/test_build_htmlhelp.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
"""
test_build_htmlhelp
~~~~~~~~~~~~~~~~~~~
Expand All @@ -9,6 +10,9 @@
import re

import pytest
from six import PY2

from sphinx.builders.htmlhelp import chm_htmlescape


@pytest.mark.sphinx('htmlhelp', testroot='build-htmlhelp')
Expand All @@ -22,5 +26,29 @@ def test_chm(app):
with open(hhk_path, 'rb') as f:
data = f.read()
m = re.search(br'&#[xX][0-9a-fA-F]+;', data)
assert m == None, 'Hex escaping exists in .hhk file: ' + str(m.group(0))
assert m is None, 'Hex escaping exists in .hhk file: ' + str(m.group(0))


def test_chm_htmlescape():
    """Exercise chm_htmlescape() on plain text, double quotes and single quotes.

    The expected result of the one-argument call depends on the interpreter
    (PY2 vs. PY3), so those expectations are selected through ``PY2``.
    """
    # Text without any quote characters.
    assert chm_htmlescape('Hello world') == 'Hello world'
    assert chm_htmlescape(u'Unicode 文字') == u'Unicode 文字'
    assert chm_htmlescape('&#x45') == '&amp;#x45'

    # Double quotes: only the default (no ``quote`` argument) differs per interpreter.
    dq_input = '<Hello> "world"'
    dq_escaped = '&lt;Hello&gt; &quot;world&quot;'
    dq_untouched = '&lt;Hello&gt; "world"'
    dq_default = dq_untouched if PY2 else dq_escaped
    assert chm_htmlescape(dq_input) == dq_default
    assert chm_htmlescape(dq_input, True) == dq_escaped
    assert chm_htmlescape(dq_input, False) == dq_untouched

    # Single quotes: never escaped on py2 (following the behavior of
    # cgi.escape()); elsewhere escaped as a decimal reference unless
    # quoting is explicitly turned off.
    sq_input = "Hello 'world'"
    sq_when_quoting = sq_input if PY2 else "Hello &#39;world&#39;"
    assert chm_htmlescape(sq_input) == sq_when_quoting
    assert chm_htmlescape(sq_input, True) == sq_when_quoting
    assert chm_htmlescape(sq_input, False) == sq_input