fix: Only use HTML rules if mimeType matches (#338)

The living specs for parsing XML and HTML, which this library is trying to implement, distinguish between the different types of documents being parsed: quite a few rules differ between XML and HTML documents for parsing, constructing and serializing. So far xmldom was always "detecting" whether "the HTML rules should be applied" by looking at the current namespace. So from the first time the HTML default namespace (`http://www.w3.org/1999/xhtml`) was found, every node was treated as being part of an HTML document. This misconception is the root cause of quite a few reported bugs. BREAKING CHANGE: HTML rules are no longer applied just because of the namespace, but require the `mimeType` argument passed to `DOMParser.parseFromString(source, mimeType)` to match `'text/html'`. Doing so implies all rules for handling casing of tag and attribute names when parsing, creating nodes and searching nodes. BREAKING CHANGE: Correct the return type of `DOMParser.parseFromString` to `Document | undefined`. In case of parsing errors it was always possible that "the returned `Document`" has not been created. If you are using TypeScript you now need to handle those cases. BREAKING CHANGE: The instance property `DOMParser.options` is no longer available; instead use the individual `readonly` property per option (`assign`, `domHandler`, `errorHandler`, `normalizeLineEndings`, `locator`, `xmlns`). Those also provide the default value if the option was not passed. The `locator` option is now just a boolean (default remains `true`).
BREAKING CHANGE: The following methods no longer allow a (non spec compliant) boolean argument to toggle "HTML rules": - `XMLSerializer.serializeToString` - `Node.toString` - `Document.toString` The following interfaces have been implemented: `DOMImplementation` now implements all methods defined in the DOM spec, but not all of the behavior is implemented (see docstring): - `createDocument` creates an "XML Document" (prototype: `Document`, property `type` is `'xml'`) - `createHTMLDocument` creates an "HTML Document" (type/prototype: `Document`, property `type` is `'html'`). - when no argument is passed or the first argument is a string, the basic nodes for an HTML structure are created, as specified - when the first argument is `false` no child nodes are created `Document` now has two new readonly properties as specified in the DOM spec: - `contentType` which is the mime-type that was used to create the document - `type` which is either the string literal `'xml'` or `'html'` `MIME_TYPE` (`/lib/conventions.js`): - `hasDefaultHTMLNamespace` tests whether the provided string is one of the mime types that imply the default HTML namespace: `text/html` or `application/xhtml+xml`
xmldom · Oct 8, 2022 · 0f41739 · 0f41739
1 parent a701915
commit 0f41739
Show file tree

Hide file tree

Showing 25 changed files with 1,859 additions and 326 deletions.
diff --git a/examples/typescript-node-es6/src/index.ts b/examples/typescript-node-es6/src/index.ts
@@ -7,6 +7,8 @@ const source = `<xml xmlns="a">
 
 const doc = new DOMParser().parseFromString(source, 'text/xml')
 
+if (!doc) throw new Error('expected Document but was undefined') // parseFromString is typed `Document | undefined`
+
 const serialized = new XMLSerializer().serializeToString(doc)
 
 if (source !== serialized) {

diff --git a/index.d.ts b/index.d.ts
@@ -1,43 +1,45 @@
 /// <reference lib="dom" />
 
-declare module "@xmldom/xmldom" {
-  var DOMParser: DOMParserStatic;
-  var XMLSerializer: XMLSerializerStatic;
-  var DOMImplementation: DOMImplementationStatic;
-
-  interface DOMImplementationStatic {
-      new(): DOMImplementation;
-  }
-
-  interface DOMParserStatic {
-      new (): DOMParser;
-      new (options: Options): DOMParser;
-  }
-
-  interface XMLSerializerStatic {
-      new (): XMLSerializer;
-  }
-
-  interface DOMParser {
-      parseFromString(xmlsource: string, mimeType?: string): Document;
-  }
-
-  interface XMLSerializer {
-      serializeToString(node: Node): string;
-  }
-
-  interface Options {
-      locator?: any;
-      errorHandler?: ErrorHandlerFunction | ErrorHandlerObject | undefined;
-  }
-
-  interface ErrorHandlerFunction {
-      (level: string, msg: any): any;
-  }
-
-  interface ErrorHandlerObject {
-      warning?: ((msg: any) => any) | undefined;
-      error?: ((msg: any) => any) | undefined;
-      fatalError?: ((msg: any) => any) | undefined;
-  }
+declare module '@xmldom/xmldom' {
+	var DOMParser: DOMParserStatic
+	var XMLSerializer: XMLSerializerStatic
+	var DOMImplementation: DOMImplementationStatic
+
+	interface DOMImplementationStatic {
+		new (): DOMImplementation
+	}
+
+	interface DOMParserStatic {
+		new (): DOMParser
+		new (options: DOMParserOptions): DOMParser
+	}
+
+	interface XMLSerializerStatic {
+		new (): XMLSerializer
+	}
+
+	interface DOMParser {
+		parseFromString(source: string, mimeType?: string): Document | undefined
+	}
+
+	interface XMLSerializer {
+		serializeToString(node: Node): string
+	}
+
+	interface DOMParserOptions {
+		errorHandler?: ErrorHandlerFunction | ErrorHandlerObject
+		locator?: boolean
+		normalizeLineEndings?: (source: string) => string
+		xmlns?: Record<string, string | null | undefined>
+	}
+
+	interface ErrorHandlerFunction {
+		(level: 'warning' | 'error' | 'fatalError', msg: string): void // same names as the ErrorHandlerObject keys
+	}
+
+	interface ErrorHandlerObject {
+		warning?: (msg: string) => void
+		error?: (msg: string) => void
+		fatalError?: (msg: string) => void
+	}
 }
diff --git a/lib/conventions.js b/lib/conventions.js
@@ -9,7 +9,7 @@
  *
  * @template T
  * @param {T} object the object to freeze
- * @param {Pick<ObjectConstructor, 'freeze'> = Object} oc `Object` by default,
+ * @param {Pick<ObjectConstructor, 'freeze'>} [oc=Object] `Object` by default,
  * 				allows to inject custom object constructor for tests
  * @returns {Readonly<T>}
  *
@@ -47,6 +47,155 @@ function assign(target, source) {
 	return target
 }
 
+/**
+ * A number of attributes are boolean attributes.
+ * The presence of a boolean attribute on an element represents the `true` value,
+ * and the absence of the attribute represents the `false` value.
+ *
+ * If the attribute is present, its value must either be the empty string
+ * or a value that is an ASCII case-insensitive match for the attribute's canonical name,
+ * with no leading or trailing whitespace.
+ *
+ * Note: The values `"true"` and `"false"` are not allowed on boolean attributes.
+ * To represent a `false` value, the attribute has to be omitted altogether.
+ *
+ * @see https://html.spec.whatwg.org/#boolean-attributes
+ * @see https://html.spec.whatwg.org/#attributes-3
+ */
+var HTML_BOOLEAN_ATTRIBUTES = freeze({
+	allowfullscreen: true,
+	async: true,
+	autofocus: true,
+	autoplay: true,
+	checked: true,
+	controls: true,
+	default: true,
+	defer: true,
+	disabled: true,
+	formnovalidate: true,
+	hidden: true,
+	ismap: true,
+	itemscope: true,
+	loop: true,
+	multiple: true,
+	muted: true,
+	nomodule: true,
+	novalidate: true,
+	open: true,
+	playsinline: true,
+	readonly: true,
+	required: true,
+	reversed: true,
+	selected: true,
+})
+
+/**
+ * Tells whether `name` is an HTML boolean attribute name, compared case-insensitively.
+ * Whether the attribute is valid for the current document or parsing context is not checked.
+ *
+ * @param {string} name the attribute name to look up (lower-cased before the lookup)
+ * @return {boolean} `true` if the lower-cased name is an own key of `HTML_BOOLEAN_ATTRIBUTES`
+ * @see HTML_BOOLEAN_ATTRIBUTES
+ * @see https://html.spec.whatwg.org/#boolean-attributes
+ * @see https://html.spec.whatwg.org/#attributes-3
+ */
+function isHTMLBooleanAttribute(name) {
+	return Object.prototype.hasOwnProperty.call(HTML_BOOLEAN_ATTRIBUTES, name.toLowerCase())
+}
+
+/**
+ * Void elements only have a start tag; end tags must not be specified for void elements.
+ * These elements should be written as self closing like this: `<area />`.
+ * This should not be confused with optional tags that HTML allows to omit the end tag for
+ * (like `li`, `tr` and others), which can have content after them,
+ * so they can not be written as self closing.
+ * xmldom does not have any logic for optional end tags cases and will report them as a warning.
+ * Content that would go into the unopened element will instead be added as a sibling text node.
+ *
+ * @type {Readonly<{area: boolean, col: boolean, img: boolean, wbr: boolean, link: boolean, hr: boolean, source: boolean, br: boolean, input: boolean, param: boolean, meta: boolean, embed: boolean, track: boolean, base: boolean}>}
+ * @see https://html.spec.whatwg.org/#void-elements
+ * @see https://html.spec.whatwg.org/#optional-tags
+ */
+var HTML_VOID_ELEMENTS = freeze({
+	area: true,
+	base: true,
+	br: true,
+	col: true,
+	embed: true,
+	hr: true,
+	img: true,
+	input: true,
+	link: true,
+	meta: true,
+	param: true,
+	source: true,
+	track: true,
+	wbr: true,
+})
+
+/**
+ * Tells whether `tagName` is an HTML void element name, compared case-insensitively.
+ * Whether the tag is allowed in the current document or parsing context
+ * is not checked.
+ *
+ * @param {string} tagName the tag name to look up (lower-cased before the lookup)
+ * @return {boolean} `true` if the lower-cased name is an own key of `HTML_VOID_ELEMENTS`
+ * @see HTML_VOID_ELEMENTS
+ * @see https://html.spec.whatwg.org/#void-elements
+ */
+function isHTMLVoidElement(tagName) {
+	return Object.prototype.hasOwnProperty.call(HTML_VOID_ELEMENTS, tagName.toLowerCase())
+}
+
+/**
+ * Tag names that are raw text elements according to HTML spec.
+ * The value denotes whether they are escapable or not.
+ *
+ * @see isHTMLEscapableRawTextElement
+ * @see isHTMLRawTextElement
+ * @see https://html.spec.whatwg.org/#raw-text-elements
+ * @see https://html.spec.whatwg.org/#escapable-raw-text-elements
+ */
+var HTML_RAW_TEXT_ELEMENTS = freeze({
+	script: false,
+	style: false,
+	textarea: true,
+	title: true,
+})
+
+/**
+ * Tells whether `tagName` is an HTML raw text element name (compared case-insensitively),
+ * i.e. an own key of `HTML_RAW_TEXT_ELEMENTS` whose value is `false` (not escapable).
+ * Whether the tag is allowed in the current document or parsing context is not checked.
+ *
+ * @param {string} tagName the tag name to look up (lower-cased before the lookup)
+ * @return {boolean} `true` only for non-escapable raw text element names
+ * @see isHTMLEscapableRawTextElement
+ * @see HTML_RAW_TEXT_ELEMENTS
+ * @see https://html.spec.whatwg.org/#raw-text-elements
+ * @see https://html.spec.whatwg.org/#escapable-raw-text-elements
+ */
+function isHTMLRawTextElement(tagName) {
+	var key = tagName.toLowerCase()
+	return Object.prototype.hasOwnProperty.call(HTML_RAW_TEXT_ELEMENTS, key) && !HTML_RAW_TEXT_ELEMENTS[key]
+}
+/**
+ * Tells whether `tagName` is an HTML escapable raw text element name (compared case-insensitively),
+ * i.e. an own key of `HTML_RAW_TEXT_ELEMENTS` whose value is `true` (escapable).
+ * Whether the tag is allowed in the current document or parsing context is not checked.
+ *
+ * @param {string} tagName the tag name to look up (lower-cased before the lookup)
+ * @return {boolean} `true` only for escapable raw text element names
+ * @see isHTMLRawTextElement
+ * @see HTML_RAW_TEXT_ELEMENTS
+ * @see https://html.spec.whatwg.org/#raw-text-elements
+ * @see https://html.spec.whatwg.org/#escapable-raw-text-elements
+ */
+function isHTMLEscapableRawTextElement(tagName) {
+	var key = tagName.toLowerCase()
+	return Object.prototype.hasOwnProperty.call(HTML_RAW_TEXT_ELEMENTS, key) && HTML_RAW_TEXT_ELEMENTS[key]
+}
+
 /**
  * All mime types that are allowed as input to `DOMParser.parseFromString`
  *
@@ -72,14 +221,32 @@ var MIME_TYPE = freeze({
 	 * @param {string} [value]
 	 * @returns {boolean}
 	 *
-	 * @see https://www.iana.org/assignments/media-types/text/html IANA MimeType registration
-	 * @see https://en.wikipedia.org/wiki/HTML Wikipedia
-	 * @see https://developer.mozilla.org/en-US/docs/Web/API/DOMParser/parseFromString MDN
-	 * @see https://html.spec.whatwg.org/multipage/dynamic-markup-insertion.html#dom-domparser-parsefromstring 	 */
+	 * @see [IANA MimeType registration](https://www.iana.org/assignments/media-types/text/html)
+	 * @see [Wikipedia](https://en.wikipedia.org/wiki/HTML)
+	 * @see [`DOMParser.parseFromString` @ MDN](https://developer.mozilla.org/en-US/docs/Web/API/DOMParser/parseFromString)
+	 * @see [`DOMParser.parseFromString` @ HTML Specification](https://html.spec.whatwg.org/multipage/dynamic-markup-insertion.html#dom-domparser-parsefromstring)
+	 */
 	isHTML: function (value) {
 		return value === MIME_TYPE.HTML
 	},
 
+	/**
+	 * Tells whether `mimeType` is one of the two mime types (`text/html`, `application/xhtml+xml`)
+	 * for which the spec defines that the HTML namespace is provided as the default in some cases.
+	 *
+	 * @param {string} mimeType the mime type to test
+	 * @returns {boolean} `true` for the HTML mime type and for `MIME_TYPE.XML_XHTML_APPLICATION`
+	 *
+	 * @see https://dom.spec.whatwg.org/#dom-document-createelement
+	 * @see https://dom.spec.whatwg.org/#dom-domimplementation-createdocument
+	 * @see https://dom.spec.whatwg.org/#dom-domimplementation-createhtmldocument
+	 */
+	hasDefaultHTMLNamespace: function (mimeType) {
+		var isXHTML = mimeType === MIME_TYPE.XML_XHTML_APPLICATION
+		// `text/html` is delegated to `isHTML` so both checks stay in sync
+		return isXHTML || MIME_TYPE.isHTML(mimeType)
+	},
+
 	/**
 	 * `application/xml`, the standard mime type for XML documents.
 	 *
@@ -164,7 +331,14 @@ var NAMESPACE = freeze({
 	XMLNS: 'http://www.w3.org/2000/xmlns/',
 })
 
-exports.assign = assign;
-exports.freeze = freeze;
-exports.MIME_TYPE = MIME_TYPE;
-exports.NAMESPACE = NAMESPACE;
+exports.assign = assign
+exports.freeze = freeze
+exports.HTML_BOOLEAN_ATTRIBUTES = HTML_BOOLEAN_ATTRIBUTES
+exports.HTML_RAW_TEXT_ELEMENTS = HTML_RAW_TEXT_ELEMENTS
+exports.HTML_VOID_ELEMENTS = HTML_VOID_ELEMENTS
+exports.isHTMLBooleanAttribute = isHTMLBooleanAttribute
+exports.isHTMLRawTextElement = isHTMLRawTextElement
+exports.isHTMLEscapableRawTextElement = isHTMLEscapableRawTextElement
+exports.isHTMLVoidElement = isHTMLVoidElement
+exports.MIME_TYPE = MIME_TYPE
+exports.NAMESPACE = NAMESPACE