Skip to content

Commit

Permalink
Optimizing document sanitizer
Browse files Browse the repository at this point in the history
Corrector is currently using slow and deprecated (mozilla/bleach#698) bleach. Based on the fact that X-Road metrics should not contain HTML it would be more beneficial to just use python translate method and replace potentially dangerous HTML characters. Translate does not parse html and estimated to be 100 times faster than bleach.

Using translate method instead of bleach.clean.

Renaming sanitise -> sanitize to be consistent with the rest of the code.
  • Loading branch information
VitaliStupin committed Nov 29, 2023
1 parent 55a6dbb commit 23a8d23
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 18 deletions.
8 changes: 4 additions & 4 deletions corrector_module/opmon_corrector/corrector_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def consume_data(self, data):
x_request_id = data['x_request_id']
documents = []
for _doc in data['documents']:
sanitised_doc = doc_m.sanitise_document(_doc)
fix_doc = doc_m.correct_structure(sanitised_doc)
sanitized_doc = doc_m.sanitize_document(_doc)
fix_doc = doc_m.correct_structure(sanitized_doc)
documents.append(fix_doc)
duplicates = 0

Expand Down Expand Up @@ -163,8 +163,8 @@ def consume_faulty_data(self, data):
# Get parameters
# logger_manager = data['logger_manager']
doc_m = data['document_manager']
sanitised_doc = doc_m.sanitise_document(data['document'])
fixed_doc = doc_m.correct_structure(sanitised_doc)
sanitized_doc = doc_m.sanitize_document(data['document'])
fixed_doc = doc_m.correct_structure(sanitized_doc)
producer = fixed_doc if (
fixed_doc['securityServerType'].lower() == SECURITY_SERVER_TYPE_PRODUCER) else None
client = fixed_doc if (
Expand Down
25 changes: 16 additions & 9 deletions corrector_module/opmon_corrector/document_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import bleach

from opmon_corrector import (SECURITY_SERVER_TYPE_CLIENT,
SECURITY_SERVER_TYPE_PRODUCER, __version__)
from opmon_corrector.logger_manager import LoggerManager
Expand Down Expand Up @@ -309,17 +307,26 @@ def create_json(client_document, producer_document, x_request_id):
}

@staticmethod
def sanitise_document(document: dict) -> dict:
def escape_html(value: str) -> str:
"""
Escape html to avoid potential XSS attacks during log processing.
:param value: The string to be escaped.
:return: Returns escaped string.
"""
return value.translate(str.maketrans({'<': '&lt;', '>': '&gt;', '&': '&amp;'}))

@staticmethod
def sanitize_document(document: dict) -> dict:
"""
Sanitizes the document by cleaning string values using bleach if they are present.
:param document: The document to be sanitised.
:return: Returns the sanitised document.
Sanitizes the document by HTML escaping string values if they are present.
:param document: The document to be sanitized.
:return: Returns the sanitized document.
"""
sanitised_document = {
key: bleach.clean(value) if isinstance(value, str) else value
sanitized_document = {
key: DocumentManager.escape_html(value) if isinstance(value, str) else value
for key, value in document.items()
}
return sanitised_document
return sanitized_document

def correct_structure(self, doc):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -530,17 +530,17 @@ def test_correct_structure(mock_logger_manager, basic_settings):
assert all([v is None for v in doc.values()])


def test_sanitise_document(mock_logger_manager, basic_settings):
def test_sanitize_document(mock_logger_manager, basic_settings):
dm = DocumentManager(basic_settings)
sanitised_doc = dm.sanitise_document({
sanitized_doc = dm.sanitize_document({
'field1': '<img src/onerror=prompt(8)>',
'field2': '</scrip</script>t><img src =q onerror=prompt(8)>',
'field3': 100,
'field4': True,
'field5': None,
'field6': {'sub': 'test'}
})
assert sanitised_doc == {
assert sanitized_doc == {
'field1': '&lt;img src/onerror=prompt(8)&gt;',
'field2': '&lt;/scrip&lt;/script&gt;t&gt;&lt;img src =q onerror=prompt(8)&gt;',
'field3': 100,
Expand Down
3 changes: 1 addition & 2 deletions corrector_module/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@
requirements = [
'setuptools==67.4.0',
'pymongo==3.10.1',
'pyyaml==5.4.1',
'bleach==6.0.0'
'pyyaml==5.4.1'
]

classifiers = [
Expand Down

0 comments on commit 23a8d23

Please sign in to comment.