Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitizing search entry titles #3560

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 13 additions & 0 deletions docs/user-guide/writing-your-docs.md
Expand Up @@ -538,3 +538,16 @@ Note that fenced code blocks can not be indented. Therefore, they cannot be
nested inside list items, blockquotes, etc.

[fenced code blocks]: https://python-markdown.github.io/extensions/fenced_code_blocks/

### Search Keywords

The [search plugin][] supports defining keywords for individual sections of a page. When search terms match the defined keywords, it ensures that the relevant section will be included in the search results. To use the feature, [enable][markdown_extensions] the [attr_list][] extension for Markdown.

To define keywords for a section, assign a string of space-separated words to the `data-search-keywords` attribute of any heading. Specifically, define an attribute list at the end of the heading which contains the attribute.

```markdown
# Section Title {data-search-keywords='space separated list of words'}
```

[search plugin]: configuration.md#search
[attr_list]: https://python-markdown.github.io/extensions/attr_list/
1 change: 1 addition & 0 deletions mkdocs/contrib/search/prebuild-index.js
Expand Up @@ -45,6 +45,7 @@ stdin.on('end', function () {
}
this.field('title');
this.field('text');
this.field('keywords', {'boost': 10});
this.ref('location');

data.docs.forEach(function (doc) {
Expand Down
81 changes: 45 additions & 36 deletions mkdocs/contrib/search/search_index.py
Expand Up @@ -10,7 +10,6 @@

if TYPE_CHECKING:
from mkdocs.structure.pages import Page
from mkdocs.structure.toc import AnchorLink, TableOfContents

try:
from lunr import lunr # type: ignore
Expand All @@ -32,25 +31,12 @@ def __init__(self, **config) -> None:
self._entries: list[dict] = []
self.config = config

def _find_toc_by_id(self, toc, id_: str | None) -> AnchorLink | None:
"""
Given a table of contents and HTML ID, iterate through
and return the matched item in the TOC.
"""
for toc_item in toc:
if toc_item.id == id_:
return toc_item
toc_item_r = self._find_toc_by_id(toc_item.children, id_)
if toc_item_r is not None:
return toc_item_r
return None

def _add_entry(self, title: str | None, text: str, keywords: str | None, loc: str) -> None:
    """A simple wrapper to add an entry, dropping bad characters.

    Normalizes non-breaking spaces and collapses runs of whitespace in
    ``text``, then appends a dict with the indexed fields (``title``,
    ``text``, ``keywords``, ``location``) to ``self._entries``.
    """
    # Non-breaking spaces should index as ordinary spaces.
    text = text.replace('\u00a0', ' ')
    # Collapse any run of ASCII whitespace into a single space.
    text = re.sub(r'[ \t\n\r\f\v]+', ' ', text.strip())

    self._entries.append({'title': title, 'text': text, 'keywords': keywords, 'location': loc})

def add_entry_from_context(self, page: Page) -> None:
"""
Expand All @@ -72,25 +58,21 @@ def add_entry_from_context(self, page: Page) -> None:

# Create an entry for the full page.
text = parser.stripped_html.rstrip('\n') if self.config['indexing'] == 'full' else ''
self._add_entry(title=page.title, text=text, loc=url)
self._add_entry(title=page.title, text=text, keywords='', loc=url)

if self.config['indexing'] in ['full', 'sections']:
for section in parser.data:
self.create_entry_for_section(section, page.toc, url)
self.create_entry_for_section(section, url)

def create_entry_for_section(self, section: ContentSection, abs_url: str) -> None:
    """
    Given a section of a page and the absolute url for the page
    create an entry in the index.

    The section's own title, keywords, and id (as the URL fragment)
    are used for the entry; the body text is included only when
    full-content indexing is configured.
    """
    text = ' '.join(section.text) if self.config['indexing'] == 'full' else ''
    self._add_entry(
        title=section.title, text=text, keywords=section.keywords, loc=f'{abs_url}#{section.id}'
    )

def generate_search_index(self) -> str:
"""Python to json conversion."""
Expand Down Expand Up @@ -122,7 +104,7 @@ def generate_search_index(self) -> str:
if haslunrpy:
lunr_idx = lunr(
ref='location',
fields=('title', 'text'),
fields=('title', 'text', dict(field_name='keywords', boost=10)),
documents=self._entries,
languages=self.config['lang'],
)
Expand Down Expand Up @@ -150,13 +132,26 @@ def __init__(
text: list[str] | None = None,
id_: str | None = None,
title: str | None = None,
keywords: str | None = None,
) -> None:
self.text = text or []
self.id = id_
self.title = title
self.title = title or ''
self.keywords = keywords or ''
oprypin marked this conversation as resolved.
Show resolved Hide resolved

def __eq__(self, other):
    # Sections are equal when every indexed field matches.
    return (
        self.text == other.text
        and self.id == other.id
        and self.title == other.title
        and self.keywords == other.keywords
    )

def __repr__(self):
    # Debug-friendly representation mirroring the indexed fields.
    return (
        f"{self.__class__.__name__}("
        f"text={self.text}, id='{self.id}', title='{self.title}', keywords='{self.keywords}')"
    )


_HEADER_TAGS = tuple(f"h{x}" for x in range(1, 7))
Expand All @@ -175,10 +170,17 @@ def __init__(self, *args, **kwargs) -> None:
self.data: list[ContentSection] = []
self.section: ContentSection | None = None
self.is_header_tag = False
self.is_permalink = False
self._stripped_html: list[str] = []

def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
"""Called at the start of every HTML tag."""
atts = dict(attrs)
waylan marked this conversation as resolved.
Show resolved Hide resolved
# Check for permalink in header
if self.is_header_tag and tag == 'a' and 'headerlink' in (atts.get('class') or ''):
waylan marked this conversation as resolved.
Show resolved Hide resolved
self.is_permalink = True
return

# We only care about the opening tag for headings.
if tag not in _HEADER_TAGS:
return
Expand All @@ -187,14 +189,17 @@ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None
# for it and assign the ID if it has one.
self.is_header_tag = True
self.section = ContentSection()
self.section.id = atts.get('id')
self.section.keywords = atts.get('data-search-keywords') or ''
self.data.append(self.section)

for attr in attrs:
if attr[0] == "id":
self.section.id = attr[1]

def handle_endtag(self, tag: str) -> None:
"""Called at the end of every HTML tag."""
# Check for permalinks
if self.is_permalink and tag == 'a':
self.is_permalink = False
return

# We only care about the opening tag for headings.
if tag not in _HEADER_TAGS:
return
Expand All @@ -203,6 +208,10 @@ def handle_endtag(self, tag: str) -> None:

def handle_data(self, data: str) -> None:
"""Called for the text contents of each tag."""
# Do not retain permalink text.
if self.is_permalink:
return

self._stripped_html.append(data)

if self.section is None:
Expand All @@ -216,7 +225,7 @@ def handle_data(self, data: str) -> None:
# Otherwise it is content of something under that header
# section.
if self.is_header_tag:
self.section.title = data
self.section.title = self.section.title + data
waylan marked this conversation as resolved.
Show resolved Hide resolved
else:
self.section.text.append(data.rstrip('\n'))

Expand Down
4 changes: 2 additions & 2 deletions mkdocs/contrib/search/templates/search/main.js
Expand Up @@ -28,7 +28,7 @@ function escapeHtml (value) {
.replace(/>/g, '>');
}

// Render a single search hit as an HTML <article> element.
// NOTE(review): `keywords` is accepted for parity with the index fields
// but is not currently displayed in the rendered result.
function formatResult (location, title, summary, keywords) {
  return '<article><h3><a href="' + joinUrl(base_url, location) + '">'+ escapeHtml(title) + '</a></h3><p>' + escapeHtml(summary) +'</p></article>';
}

Expand All @@ -40,7 +40,7 @@ function displayResults (results) {
if (results.length > 0){
for (var i=0; i < results.length; i++){
var result = results[i];
var html = formatResult(result.location, result.title, result.summary);
var html = formatResult(result.location, result.title, result.summary, result.keywords);
search_results.insertAdjacentHTML('beforeend', html);
}
} else {
Expand Down
1 change: 1 addition & 0 deletions mkdocs/contrib/search/templates/search/worker.js
Expand Up @@ -77,6 +77,7 @@ function onScriptsLoaded () {
}
this.field('title');
this.field('text');
this.field('keywords', {'boost': 10});
this.ref('location');

for (var i=0; i < data.docs.length; i++) {
Expand Down
64 changes: 34 additions & 30 deletions mkdocs/tests/search_tests.py
Expand Up @@ -9,8 +9,7 @@
from mkdocs.contrib.search import search_index
from mkdocs.structure.files import File
from mkdocs.structure.pages import Page
from mkdocs.structure.toc import get_toc
from mkdocs.tests.base import dedent, get_markdown_toc, load_config
from mkdocs.tests.base import dedent, load_config


def strip_whitespace(string):
Expand Down Expand Up @@ -283,7 +282,23 @@ def test_content_parser(self):
parser.close()

self.assertEqual(
parser.data, [search_index.ContentSection(text=["TEST"], id_="title", title="Title")]
parser.data,
[search_index.ContentSection(text=["TEST"], id_="title", title="Title", keywords='')],
)

def test_content_parser_header_has_child(self):
    # Text interleaved with child elements inside a heading should be
    # concatenated into a single section title.
    parser = search_index.ContentParser()
    parser.feed('<h1 id="title">Title <span>title</span> TITLE</h1>TEST')
    parser.close()

    expected = [
        search_index.ContentSection(
            text=["TEST"], id_="title", title="Title title TITLE", keywords=''
        )
    ]
    self.assertEqual(parser.data, expected)

def test_content_parser_no_id(self):
Expand All @@ -293,7 +308,8 @@ def test_content_parser_no_id(self):
parser.close()

self.assertEqual(
parser.data, [search_index.ContentSection(text=["TEST"], id_=None, title="Title")]
parser.data,
[search_index.ContentSection(text=["TEST"], id_=None, title="Title", keywords='')],
)

def test_content_parser_content_before_header(self):
Expand All @@ -303,7 +319,8 @@ def test_content_parser_content_before_header(self):
parser.close()

self.assertEqual(
parser.data, [search_index.ContentSection(text=["TEST"], id_=None, title="Title")]
parser.data,
[search_index.ContentSection(text=["TEST"], id_=None, title="Title", keywords='')],
)

def test_content_parser_no_sections(self):
Expand All @@ -313,30 +330,20 @@ def test_content_parser_no_sections(self):

self.assertEqual(parser.data, [])

def test_find_toc_by_id(self):
"""Test finding the relevant TOC item by the tag ID."""
index = search_index.SearchIndex()

md = dedent(
"""
# Heading 1
## Heading 2
### Heading 3
"""
)
toc = get_toc(get_markdown_toc(md))

toc_item = index._find_toc_by_id(toc, "heading-1")
self.assertEqual(toc_item.url, "#heading-1")
self.assertEqual(toc_item.title, "Heading 1")
def test_data_search_keywords(self):
    # A heading's data-search-keywords attribute should be captured on
    # the resulting ContentSection.
    parser = search_index.ContentParser()

    parser.feed('<h1 id="title" data-search-keywords="search keywords">Title</h1>TEST')
    parser.close()

    self.assertEqual(
        parser.data,
        [
            search_index.ContentSection(
                text=["TEST"], id_="title", title="Title", keywords="search keywords"
            )
        ],
    )

def test_create_search_index(self):
html_content = """
Expand Down Expand Up @@ -369,7 +376,6 @@ def test_create_search_index(self):
### Heading 3
"""
)
toc = get_toc(get_markdown_toc(md))

full_content = ''.join(f"Heading{i}Content{i}" for i in range(1, 4))

Expand All @@ -379,7 +385,6 @@ def test_create_search_index(self):
for page in pages:
# Fake page.read_source() and page.render()
page.markdown = md
page.toc = toc
page.content = html_content

index = search_index.SearchIndex(**plugin.config)
Expand Down Expand Up @@ -425,7 +430,6 @@ def test_page(title, filename, config):
## Heading 2
### Heading 3"""
)
test_page.toc = get_toc(get_markdown_toc(test_page.markdown))
return test_page

def validate_full(data, page):
Expand Down