Skip to content

Commit

Permalink
Refactor abbr Extension
Browse files Browse the repository at this point in the history
A new `AbbrTreeprocessor` has been introduced, which replaces the now
deprecated `AbbrInlineProcessor`. Abbreviation processing now happens
after Attribute Lists, avoiding a conflict between the two extensions.
Fixes #1460.

The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which
better reflects what it is. `AbbrPreprocessor` has been deprecated.

A call to `Markdown.reset()` now clears all previously defined abbreviations.
  • Loading branch information
waylan committed Apr 25, 2024
1 parent 993b57b commit ec8c305
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 15 deletions.
13 changes: 13 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [unreleased]

### Changed

#### Refactor `abbr` Extension

A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated
`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists,
avoiding a conflict between the two extensions (#1460).

The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which
better reflects that it is a block processor rather than a preprocessor. The old
name `AbbrPreprocessor` remains available as an alias but has been deprecated.

A call to `Markdown.reset()` now clears all previously defined abbreviations.

### Fixed

* Fixed links to source code on GitHub from the documentation (#1453).
Expand Down
89 changes: 75 additions & 14 deletions markdown/extensions/abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,41 +25,102 @@
from . import Extension
from ..blockprocessors import BlockProcessor
from ..inlinepatterns import InlineProcessor
from ..util import AtomicString
from ..treeprocessors import Treeprocessor
from ..util import AtomicString, deprecated
from typing import TYPE_CHECKING
import re
import xml.etree.ElementTree as etree

if TYPE_CHECKING: # pragma: no cover
from .. import Markdown
from ..blockparsers import BlockParser


class AbbrExtension(Extension):
""" Abbreviation Extension for Python-Markdown. """

def extendMarkdown(self, md):
""" Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """
md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)
def __init__(self, **kwargs):
""" Initiate Extension and set up configs. """
super().__init__(**kwargs)
# Mapping of abbreviation -> title. This same dict object is handed to both
# processors in `extendMarkdown` and is emptied in place by `reset`.
self.abbrs = {}

def reset(self):
""" Clear all previously defined abbreviations. """
# Clear in place (rather than rebinding `self.abbrs`) so that the processors,
# which were given a reference to this same dict, see it emptied as well.
self.abbrs.clear()

class AbbrPreprocessor(BlockProcessor):
""" Abbreviation Preprocessor - parse text for abbr references. """
def extendMarkdown(self, md):
""" Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """
# NOTE(review): registering the extension is presumably what makes
# `Markdown.reset()` call `self.reset()` - confirm against `Markdown.registerExtension`.
md.registerExtension(self)
# Both processors receive the same `self.abbrs` dict: the block processor fills it
# with definitions and the tree processor reads it to wrap matching text.
md.treeprocessors.register(AbbrTreeprocessor(md, self.abbrs), 'abbr', 7)
md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs), 'abbr', 16)


class AbbrTreeprocessor(Treeprocessor):
    """
    Replace abbreviation text with `<abbr>` elements.

    Every occurrence of a known abbreviation found in the `text` or `tail` of an
    element is wrapped in an `<abbr>` element whose `title` attribute holds the
    abbreviation's definition.
    """

    def __init__(self, md: Markdown | None = None, abbrs: dict | None = None):
        """
        Initialize the processor.

        Arguments:
            md: The `Markdown` instance, passed through to `Treeprocessor`.
            abbrs: Mapping of abbreviation to title. The dict is stored by reference
                (not copied); a new empty dict is used when `None` is given.
        """
        self.abbrs: dict = abbrs if abbrs is not None else {}
        # Compiled on each call to `run` from the abbreviations known at that time.
        self.RE: re.Pattern[str] | None = None
        super().__init__(md)

    def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -> None:
        """
        Recursively iterate over elements, run regex on text and wrap matches in `abbr` tags.

        Arguments:
            el: The element whose children, `text` and `tail` are processed.
            parent: The parent of `el`, or `None` for the root. The tail of `el` is
                only processed when a parent is given, because the new `<abbr>`
                siblings must be inserted into that parent.
        """
        # Children are visited before any `<abbr>` is inserted into `el`, so the
        # newly created elements are never visited themselves.
        for child in reversed(el):
            self.iter_element(child, el)
        if text := el.text:
            # Work from the last match to the first so that the offsets of the
            # earlier matches remain valid while `text` is being truncated.
            for m in reversed(list(self.RE.finditer(text))):
                abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
                abbr.text = AtomicString(m.group(0))
                abbr.tail = text[m.end():]
                el.insert(0, abbr)
                text = text[:m.start()]
            el.text = text
        # Compare against `None` explicitly: the truth value of an `Element` reflects
        # whether it has children and testing it is deprecated.
        if parent is not None and el.tail:
            tail = el.tail
            # New `<abbr>` siblings go immediately after `el` in its parent.
            index = list(parent).index(el) + 1
            for m in reversed(list(self.RE.finditer(tail))):
                abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
                abbr.text = AtomicString(m.group(0))
                abbr.tail = tail[m.end():]
                parent.insert(index, abbr)
                tail = tail[:m.start()]
            el.tail = tail

    def run(self, root: etree.Element) -> etree.Element | None:
        """
        Step through tree to find known abbreviations.

        The tree is modified in place and `None` is returned.
        """
        if not self.abbrs:
            # No abbreviations defined. Skip running processor.
            return None
        # Build and compile regex. Longer abbreviations are listed first because regex
        # alternation takes the first alternative that matches; otherwise a shorter
        # abbreviation ending at a word boundary inside a longer one (e.g. `AB` within
        # `AB-C`) would always win. `sorted` is stable, so equal-length keys keep
        # their definition order.
        keys = sorted(self.abbrs, key=len, reverse=True)
        self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in keys) })\\b")
        # Step through tree and modify on matches
        self.iter_element(root)
        return None


class AbbrBlockprocessor(BlockProcessor):
""" Parse text for abbreviation references. """

RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)

def __init__(self, parser: BlockParser, abbrs: dict):
# `abbrs` is stored by reference (not copied); `run` adds each parsed
# abbreviation -> title pair to it.
self.abbrs: dict = abbrs
super().__init__(parser)

def test(self, parent: etree.Element, block: str) -> bool:
# Accept every block unconditionally; the actual matching is done in `run`,
# which searches the block with `self.RE`.
return True

def run(self, parent: etree.Element, blocks: list[str]) -> bool:
"""
Find and remove all Abbreviation references from the text.
Each reference is set as a new `AbbrPattern` in the markdown instance.
Find and remove all abbreviation references from the text.
Each reference is added to the abbreviation collection.
"""
block = blocks.pop(0)
m = self.RE.search(block)
if m:
abbr = m.group('abbr').strip()
title = m.group('title').strip()
self.parser.md.inlinePatterns.register(
AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
)
self.abbrs[abbr] = title
if block[m.end():].strip():
# Add any content after match back to blocks as separate block
blocks.insert(0, block[m.end():].lstrip('\n'))
Expand All @@ -71,11 +132,11 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
blocks.insert(0, block)
return False

def _generate_pattern(self, text: str) -> str:
""" Given a string, returns a regex pattern to match that string. """
return f"(?P<abbr>\\b{ re.escape(text) }\\b)"

# Keep the old class name importable as an alias of `AbbrBlockprocessor`.
# NOTE(review): `deprecated` presumably emits a deprecation warning carrying this
# message when the alias is used - confirm against `markdown.util.deprecated`.
AbbrPreprocessor = deprecated("This class has been renamed to `AbbrBlockprocessor`.")(AbbrBlockprocessor)


@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.")
class AbbrInlineProcessor(InlineProcessor):
""" Abbreviation inline pattern. """

Expand Down
71 changes: 70 additions & 1 deletion tests/test_syntax/extensions/test_abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
"""

from markdown.test_tools import TestCase
from markdown import Markdown
from markdown.extensions.abbr import AbbrExtension


class TestAbbr(TestCase):
Expand Down Expand Up @@ -60,7 +62,7 @@ def test_abbr_lower(self):
)
)

def test_abbr_multiple(self):
def test_abbr_multiple_in_text(self):
self.assertMarkdownRenders(
self.dedent(
"""
Expand All @@ -79,6 +81,44 @@ def test_abbr_multiple(self):
)
)

def test_abbr_multiple_in_tail(self):
# Both abbreviations appear in the text that follows the inline `<em>The</em>`
# element (its tail); each one is expected to be wrapped in its own `<abbr>`.
self.assertMarkdownRenders(
self.dedent(
"""
*The* HTML specification
is maintained by the W3C.
*[HTML]: Hyper Text Markup Language
*[W3C]: World Wide Web Consortium
"""
),
self.dedent(
"""
<p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification
is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p>
"""
)
)

def test_abbr_multiple_nested(self):
# Each abbreviation is the content of an emphasised span; the `<abbr>` is
# expected to be created inside the corresponding `<em>` element.
self.assertMarkdownRenders(
self.dedent(
"""
The *HTML* specification
is maintained by the *W3C*.
*[HTML]: Hyper Text Markup Language
*[W3C]: World Wide Web Consortium
"""
),
self.dedent(
"""
<p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification
is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p>
"""
)
)

def test_abbr_override(self):
self.assertMarkdownRenders(
self.dedent(
Expand Down Expand Up @@ -325,3 +365,32 @@ def test_abbr_bracket(self):
"""
)
)

def test_abbr_with_attr_list(self):
# With both `abbr` and `attr_list` enabled, the word "abbr" occurs only inside
# attribute values (`alt`, `src`, and the `title` set through the attribute list).
# The expected output contains no `<abbr>` element and the attributes are intact.
self.assertMarkdownRenders(
self.dedent(
"""
*[abbr]: Abbreviation Definition
![Image with abbr in title](abbr.png){title="Image with abbr in title"}
"""
),
self.dedent(
"""
<p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p>
"""
),
extensions=['abbr', 'attr_list']
)

def test_abbr_reset(self):
# Use an explicit extension instance so its `abbrs` dict can be inspected.
ext = AbbrExtension()
md = Markdown(extensions=[ext])
md.convert('*[abbr]: Abbreviation Definition')
self.assertEqual(ext.abbrs, {'abbr': 'Abbreviation Definition'})
# Without a reset, definitions accumulate across successive `convert` calls.
md.convert('*[ABBR]: Capitalised Abbreviation')
self.assertEqual(ext.abbrs, {'abbr': 'Abbreviation Definition', 'ABBR': 'Capitalised Abbreviation'})
# `Markdown.reset()` must drop every previously collected definition.
md.reset()
self.assertEqual(ext.abbrs, {})
# After the reset only newly defined abbreviations are present.
md.convert('*[foo]: Foo Definition')
self.assertEqual(ext.abbrs, {'foo': 'Foo Definition'})

0 comments on commit ec8c305

Please sign in to comment.