Skip to content

Commit

Permalink
feat(xml/parser): Parse cdata section (#6531)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-akait committed Nov 29, 2022
1 parent 180f674 commit 7f8c4e0
Show file tree
Hide file tree
Showing 27 changed files with 1,969 additions and 421 deletions.
10 changes: 10 additions & 0 deletions crates/swc_xml_ast/src/base.rs
Expand Up @@ -29,6 +29,8 @@ pub enum Child {
Element(Element),
#[tag("Text")]
Text(Text),
#[tag("CdataSection")]
CdataSection(CdataSection),
#[tag("Comment")]
Comment(Comment),
#[tag("ProcessingInstruction")]
Expand Down Expand Up @@ -90,6 +92,14 @@ pub struct Text {
pub raw: Option<JsWord>,
}

/// AST node for an XML CDATA section (`<![CDATA[` ... `]]>`).
#[ast_node("CdataSection")]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct CdataSection {
/// Source span associated with this node.
pub span: Span,
/// Content of the section; the code generator wraps this value in
/// `<![CDATA[` and `]]>` when emitting, so it does not carry the delimiters.
pub data: JsWord,
/// Raw source text of the section when available; the parser fills it from
/// the lexer token, whose raw text includes the opening and closing delimiters.
pub raw: Option<JsWord>,
}

#[ast_node("ProcessingInstruction")]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct ProcessingInstruction {
Expand Down
4 changes: 4 additions & 0 deletions crates/swc_xml_ast/src/token.rs
Expand Up @@ -60,5 +60,9 @@ pub enum Token {
target: JsWord,
data: JsWord,
},
Cdata {
data: JsWord,
raw: JsWord,
},
Eof,
}
12 changes: 12 additions & 0 deletions crates/swc_xml_codegen/src/lib.rs
Expand Up @@ -62,6 +62,7 @@ where
Child::Text(n) => emit!(self, n),
Child::Comment(n) => emit!(self, n),
Child::ProcessingInstruction(n) => emit!(self, n),
Child::CdataSection(n) => emit!(self, n),
}
}

Expand Down Expand Up @@ -287,6 +288,17 @@ where
write_multiline_raw!(self, n.span, &processing_instruction);
}

/// Writes a CDATA section as `<![CDATA[` + `n.data` + `]]>`.
///
/// The output is rebuilt from `n.data`; `n.raw` is not consulted.
#[emitter]
fn emit_cdata_section(&mut self, n: &CdataSection) -> Result {
    const OPENING: &str = "<![CDATA[";
    const CLOSING: &str = "]]>";

    // Reserve room for the payload plus both delimiters (9 + 3 bytes) up front.
    let mut output = String::with_capacity(OPENING.len() + n.data.len() + CLOSING.len());

    output.push_str(OPENING);
    output.push_str(&n.data);
    output.push_str(CLOSING);

    write_multiline_raw!(self, n.span, &output);
}

fn create_context_for_element(&self, n: &Element) -> Ctx {
let need_escape_text = match &*n.tag_name {
"noscript" => !self.config.scripting_enabled,
Expand Down
25 changes: 25 additions & 0 deletions crates/swc_xml_codegen/tests/fixture/cdata_section/input.xml
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
25 changes: 25 additions & 0 deletions crates/swc_xml_codegen/tests/fixture/cdata_section/output.min.xml
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
25 changes: 25 additions & 0 deletions crates/swc_xml_codegen/tests/fixture/cdata_section/output.xml
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
81 changes: 59 additions & 22 deletions crates/swc_xml_parser/src/lexer/mod.rs
Expand Up @@ -111,6 +111,12 @@ struct ProcessingInstruction {
data: String,
}

/// Lexer-internal accumulator for a CDATA section that is currently being
/// tokenized; it is converted into `Token::Cdata` when the section is emitted.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Cdata {
// Characters that make up the section content.
data: String,
// Source text of the section, including the `<![CDATA[` and `]]>` delimiters
// that the lexer appends to `raw` only.
raw: String,
}

pub(crate) type LexResult<T> = Result<T, ErrorKind>;

pub struct Lexer<I>
Expand All @@ -128,11 +134,11 @@ where
additional_allowed_character: Option<char>,
pending_tokens: VecDeque<TokenAndSpan>,
doctype_raw: Option<String>,
cdata_raw: Option<String>,
current_doctype_token: Option<Doctype>,
current_comment_token: Option<Comment>,
current_processing_instruction: Option<ProcessingInstruction>,
current_tag_token: Option<Tag>,
current_cdata_token: Option<Cdata>,
attribute_start_position: Option<BytePos>,
}

Expand All @@ -155,11 +161,11 @@ where
additional_allowed_character: None,
pending_tokens: VecDeque::new(),
doctype_raw: None,
cdata_raw: None,
current_doctype_token: None,
current_comment_token: None,
current_processing_instruction: None,
current_tag_token: None,
current_cdata_token: None,
attribute_start_position: None,
};

Expand Down Expand Up @@ -864,6 +870,34 @@ where
});
}

/// Starts a fresh, empty CDATA token, replacing any token currently in progress.
fn create_cdata_token(&mut self) {
    self.current_cdata_token = Some(Cdata {
        data: String::new(),
        // Pre-size for the `<![CDATA[` and `]]>` delimiters (12 bytes).
        raw: String::with_capacity(12),
    });
}

fn append_to_cdata_token(&mut self, c: Option<char>, raw_c: Option<char>) {
if let Some(Cdata { data, raw }) = &mut self.current_cdata_token {
if let Some(c) = c {
data.push(c);
}

if let Some(raw_c) = raw_c {
raw.push(raw_c);
}
}
}

fn emit_cdata_token(&mut self) {
let cdata = self.current_cdata_token.take().unwrap();

self.emit_token(Token::Cdata {
data: cdata.data.into(),
raw: cdata.raw.into(),
});
}

fn handle_raw_and_emit_character_token(&mut self, c: char) {
let is_cr = c == '\r';

Expand Down Expand Up @@ -1400,17 +1434,16 @@ where
Some(t @ 'T') => match self.consume_next_char() {
Some(a2 @ 'A') => match self.consume_next_char() {
Some('[') => {
let mut data = String::with_capacity(7);

data.push('[');
data.push(c);
data.push(d);
data.push(a1);
data.push(t);
data.push(a2);
data.push('[');

self.cdata_raw = Some(data);
self.create_cdata_token();
self.append_to_cdata_token(None, Some('<'));
self.append_to_cdata_token(None, Some('!'));
self.append_to_cdata_token(None, Some('['));
self.append_to_cdata_token(None, Some(c));
self.append_to_cdata_token(None, Some(d));
self.append_to_cdata_token(None, Some(a1));
self.append_to_cdata_token(None, Some(t));
self.append_to_cdata_token(None, Some(a2));
self.append_to_cdata_token(None, Some('['));
self.state = State::Cdata;
}
_ => {
Expand Down Expand Up @@ -1734,11 +1767,11 @@ where
self.reconsume_in_state(State::Data);
}
// Anything else
// Emit the current input character as character token. Stay in the current
// Append the current input character to the cdata data. Stay in the current
// state.
Some(c) => {
self.validate_input_stream_character(c);
self.handle_raw_and_emit_character_token(c);
self.append_to_cdata_token(Some(c), Some(c));
}
}
}
Expand All @@ -1760,9 +1793,9 @@ where
// Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the
// CDATA section state.
Some(c) => {
self.emit_character_token((']', ']'));
self.emit_character_token((c, c));
self.reconsume_in_state(State::Cdata);
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(c), Some(c));
self.state = State::Cdata;
}
}
}
Expand All @@ -1772,13 +1805,17 @@ where
// U+003E GREATER-THAN SIGN (>)
// Switch to the data state.
Some('>') => {
self.append_to_cdata_token(None, Some(']'));
self.append_to_cdata_token(None, Some(']'));
self.append_to_cdata_token(None, Some('>'));
self.emit_cdata_token();
self.state = State::Data;
}
// U+005D RIGHT SQUARE BRACKET (])
// Emit the current input character as character token. Stay in the current
// state.
Some(c @ ']') => {
self.emit_character_token((c, c));
self.append_to_cdata_token(Some(c), Some(c));
}
// EOF
// Parse error. Reconsume the current input character in the data state.
Expand All @@ -1791,9 +1828,9 @@ where
// also emit the current input character as character token. Switch to the CDATA
// state.
Some(c) => {
self.emit_character_token((']', ']'));
self.emit_character_token((']', ']'));
self.emit_character_token((c, c));
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(c), Some(c));
self.state = State::Cdata;
}
}
Expand Down
45 changes: 45 additions & 0 deletions crates/swc_xml_parser/src/parser/mod.rs
Expand Up @@ -188,6 +188,11 @@ where
data,
})
}
Data::CdataSection { data, raw } => Child::CdataSection(CdataSection {
span: start_span,
data,
raw,
}),
_ => {
unreachable!();
}
Expand Down Expand Up @@ -267,6 +272,14 @@ where
Token::ProcessingInstruction { .. } => {
self.append_processing_instruction_to_doc(token_and_info)?;
}
Token::Cdata { .. } => {
self.errors.push(Error::new(
token_and_info.span,
ErrorKind::UnexpectedTokenInStartPhase,
));

self.append_cdata_to_doc(token_and_info)?;
}
Token::Character { value, .. } => {
if !is_whitespace(*value) {
self.errors.push(Error::new(
Expand Down Expand Up @@ -354,6 +367,11 @@ where

self.append_node(self.get_current_element(), processing_instruction);
}
Token::Cdata { .. } => {
let cdata = self.create_cdata_section(token_and_info);

self.append_node(self.get_current_element(), cdata);
}
Token::Eof => {
self.errors.push(Error::new(
token_and_info.span,
Expand All @@ -376,6 +394,14 @@ where
Token::ProcessingInstruction { .. } => {
self.append_processing_instruction_to_doc(token_and_info)?;
}
Token::Cdata { .. } => {
self.errors.push(Error::new(
token_and_info.span,
ErrorKind::UnexpectedTokenInEndPhase,
));

self.append_cdata_to_doc(token_and_info)?;
}
Token::Character { value, .. } => {
if !is_whitespace(*value) {
self.errors.push(Error::new(
Expand Down Expand Up @@ -603,6 +629,25 @@ where
Ok(())
}

/// Builds a CDATA section node from a `Token::Cdata` token.
///
/// The node copies the token's `data`, stores its `raw` as `Some(..)`, and
/// uses the token's span. Callers must pass a CDATA token; any other token
/// kind hits `unreachable!()`.
fn create_cdata_section(&self, token_and_info: &mut TokenAndInfo) -> RcNode {
    if let Token::Cdata { data, raw } = &token_and_info.token {
        let section = Data::CdataSection {
            data: data.clone(),
            raw: Some(raw.clone()),
        };

        return Node::new(section, token_and_info.span);
    }

    unreachable!()
}

/// Creates a CDATA section node from the token and appends it directly to
/// the document node.
///
/// # Panics
///
/// Panics if `self.document` is `None`.
fn append_cdata_to_doc(&mut self, token_and_info: &mut TokenAndInfo) -> PResult<()> {
    let cdata_node = self.create_cdata_section(token_and_info);
    let document = self.document.as_ref().unwrap();

    self.append_node(document, cdata_node);

    Ok(())
}

fn update_end_tag_span(&self, node: Option<&RcNode>, span: Span) {
if let Some(node) = node {
if node.start_span.borrow().is_dummy() {
Expand Down
4 changes: 4 additions & 0 deletions crates/swc_xml_parser/src/parser/node.rs
Expand Up @@ -36,6 +36,10 @@ pub enum Data {
target: JsWord,
data: JsWord,
},
CdataSection {
data: JsWord,
raw: Option<JsWord>,
},
Comment {
data: JsWord,
raw: Option<JsWord>,
Expand Down

1 comment on commit 7f8c4e0

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmark

Benchmark suite Current: 7f8c4e0 Previous: 81224b5 Ratio
es/full/bugs-1 341810 ns/iter (± 28980) 365328 ns/iter (± 33945) 0.94
es/full/minify/libraries/antd 2018981038 ns/iter (± 20723901) 2107189944 ns/iter (± 128191799) 0.96
es/full/minify/libraries/d3 442203065 ns/iter (± 16064490) 470107055 ns/iter (± 35088197) 0.94
es/full/minify/libraries/echarts 1703355564 ns/iter (± 26324547) 1774468471 ns/iter (± 79040254) 0.96
es/full/minify/libraries/jquery 110845356 ns/iter (± 5169149) 124719297 ns/iter (± 5571075) 0.89
es/full/minify/libraries/lodash 132277049 ns/iter (± 2967923) 145008154 ns/iter (± 15196825) 0.91
es/full/minify/libraries/moment 66907887 ns/iter (± 3036705) 70419868 ns/iter (± 6250800) 0.95
es/full/minify/libraries/react 22176430 ns/iter (± 305040) 26011022 ns/iter (± 8917317) 0.85
es/full/minify/libraries/terser 357419993 ns/iter (± 9563367) 374857588 ns/iter (± 20216458) 0.95
es/full/minify/libraries/three 643875030 ns/iter (± 17605416) 674491467 ns/iter (± 60117562) 0.95
es/full/minify/libraries/typescript 3728175153 ns/iter (± 42866771) 3929072169 ns/iter (± 103581545) 0.95
es/full/minify/libraries/victory 915345212 ns/iter (± 17857521) 947208804 ns/iter (± 53613068) 0.97
es/full/minify/libraries/vue 179418799 ns/iter (± 4200803) 186362935 ns/iter (± 10012014) 0.96
es/full/codegen/es3 33808 ns/iter (± 896) 33974 ns/iter (± 632) 1.00
es/full/codegen/es5 33883 ns/iter (± 2293) 33877 ns/iter (± 1046) 1.00
es/full/codegen/es2015 33844 ns/iter (± 432) 33961 ns/iter (± 1526) 1.00
es/full/codegen/es2016 33812 ns/iter (± 633) 33852 ns/iter (± 690) 1.00
es/full/codegen/es2017 33762 ns/iter (± 1203) 33934 ns/iter (± 2102) 0.99
es/full/codegen/es2018 33827 ns/iter (± 873) 33910 ns/iter (± 828) 1.00
es/full/codegen/es2019 33822 ns/iter (± 950) 33792 ns/iter (± 827) 1.00
es/full/codegen/es2020 33836 ns/iter (± 1828) 34074 ns/iter (± 2069) 0.99
es/full/all/es3 191349752 ns/iter (± 6917198) 216390998 ns/iter (± 25768620) 0.88
es/full/all/es5 176588373 ns/iter (± 11646973) 205628084 ns/iter (± 17480324) 0.86
es/full/all/es2015 142249333 ns/iter (± 10842831) 166888123 ns/iter (± 10837340) 0.85
es/full/all/es2016 142040995 ns/iter (± 4096314) 152641547 ns/iter (± 15029279) 0.93
es/full/all/es2017 144454669 ns/iter (± 14462038) 165712110 ns/iter (± 21696277) 0.87
es/full/all/es2018 149065172 ns/iter (± 12788899) 153998771 ns/iter (± 11429859) 0.97
es/full/all/es2019 152736481 ns/iter (± 15570807) 159882719 ns/iter (± 11490676) 0.96
es/full/all/es2020 130389675 ns/iter (± 4044322) 148746533 ns/iter (± 8774768) 0.88
es/full/parser 684487 ns/iter (± 23236) 728824 ns/iter (± 65214) 0.94
es/full/base/fixer 25079 ns/iter (± 748) 26502 ns/iter (± 1685) 0.95
es/full/base/resolver_and_hygiene 89057 ns/iter (± 2503) 94321 ns/iter (± 9653) 0.94
serialization of ast node 214 ns/iter (± 3) 219 ns/iter (± 14) 0.98
serialization of serde 231 ns/iter (± 7) 232 ns/iter (± 16) 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.