Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(xml/parser): parse cdata section #6531

Merged
merged 6 commits into from Nov 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions crates/swc_xml_ast/src/base.rs
Expand Up @@ -29,6 +29,8 @@ pub enum Child {
Element(Element),
#[tag("Text")]
Text(Text),
#[tag("CdataSection")]
CdataSection(CdataSection),
#[tag("Comment")]
Comment(Comment),
#[tag("ProcessingInstruction")]
Expand Down Expand Up @@ -90,6 +92,14 @@ pub struct Text {
pub raw: Option<JsWord>,
}

/// AST node for a CDATA section (`<![CDATA[` ... `]]>`) appearing as a child node.
#[ast_node("CdataSection")]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct CdataSection {
/// Span recorded for this node.
pub span: Span,
/// Section content; the code generator wraps this in `<![CDATA[` and `]]>` when emitting.
pub data: JsWord,
/// Raw source text of the section when available (the parser fills this from the
/// lexer's raw buffer, which also contains the opening and closing delimiters).
pub raw: Option<JsWord>,
}

#[ast_node("ProcessingInstruction")]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct ProcessingInstruction {
Expand Down
4 changes: 4 additions & 0 deletions crates/swc_xml_ast/src/token.rs
Expand Up @@ -60,5 +60,9 @@ pub enum Token {
target: JsWord,
data: JsWord,
},
Cdata {
data: JsWord,
raw: JsWord,
},
Eof,
}
12 changes: 12 additions & 0 deletions crates/swc_xml_codegen/src/lib.rs
Expand Up @@ -62,6 +62,7 @@ where
Child::Text(n) => emit!(self, n),
Child::Comment(n) => emit!(self, n),
Child::ProcessingInstruction(n) => emit!(self, n),
Child::CdataSection(n) => emit!(self, n),
}
}

Expand Down Expand Up @@ -287,6 +288,17 @@ where
write_multiline_raw!(self, n.span, &processing_instruction);
}

// Writes a CDATA section as `<![CDATA[` + data + `]]>`; the data is emitted verbatim.
#[emitter]
fn emit_cdata_section(&mut self, n: &CdataSection) -> Result {
    // Joining the three pieces in one step yields the same text as appending them in turn.
    let cdata_section: String = ["<![CDATA[", &*n.data, "]]>"].concat();

    write_multiline_raw!(self, n.span, &cdata_section);
}

fn create_context_for_element(&self, n: &Element) -> Ctx {
let need_escape_text = match &*n.tag_name {
"noscript" => !self.config.scripting_enabled,
Expand Down
25 changes: 25 additions & 0 deletions crates/swc_xml_codegen/tests/fixture/cdata_section/input.xml
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
25 changes: 25 additions & 0 deletions crates/swc_xml_codegen/tests/fixture/cdata_section/output.xml
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
81 changes: 59 additions & 22 deletions crates/swc_xml_parser/src/lexer/mod.rs
Expand Up @@ -111,6 +111,12 @@ struct ProcessingInstruction {
data: String,
}

/// CDATA token under construction in the lexer; turned into `Token::Cdata` when emitted.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Cdata {
// Section content only: the lexer appends the `<![CDATA[` / `]]>` delimiters to `raw`, not here.
data: String,
// Every character consumed for the section, including the opening and closing delimiters.
raw: String,
}

pub(crate) type LexResult<T> = Result<T, ErrorKind>;

pub struct Lexer<I>
Expand All @@ -128,11 +134,11 @@ where
additional_allowed_character: Option<char>,
pending_tokens: VecDeque<TokenAndSpan>,
doctype_raw: Option<String>,
cdata_raw: Option<String>,
current_doctype_token: Option<Doctype>,
current_comment_token: Option<Comment>,
current_processing_instruction: Option<ProcessingInstruction>,
current_tag_token: Option<Tag>,
current_cdata_token: Option<Cdata>,
attribute_start_position: Option<BytePos>,
}

Expand All @@ -155,11 +161,11 @@ where
additional_allowed_character: None,
pending_tokens: VecDeque::new(),
doctype_raw: None,
cdata_raw: None,
current_doctype_token: None,
current_comment_token: None,
current_processing_instruction: None,
current_tag_token: None,
current_cdata_token: None,
attribute_start_position: None,
};

Expand Down Expand Up @@ -864,6 +870,34 @@ where
});
}

/// Starts a fresh CDATA token, replacing any token that was previously in progress.
fn create_cdata_token(&mut self) {
    self.current_cdata_token = Some(Cdata {
        data: String::new(),
        // Room for the `<![CDATA[` and `]]>` delimiters (9 + 3 characters).
        raw: String::with_capacity(12),
    });
}

fn append_to_cdata_token(&mut self, c: Option<char>, raw_c: Option<char>) {
if let Some(Cdata { data, raw }) = &mut self.current_cdata_token {
if let Some(c) = c {
data.push(c);
}

if let Some(raw_c) = raw_c {
raw.push(raw_c);
}
}
}

fn emit_cdata_token(&mut self) {
let cdata = self.current_cdata_token.take().unwrap();

self.emit_token(Token::Cdata {
data: cdata.data.into(),
raw: cdata.raw.into(),
});
}

fn handle_raw_and_emit_character_token(&mut self, c: char) {
let is_cr = c == '\r';

Expand Down Expand Up @@ -1400,17 +1434,16 @@ where
Some(t @ 'T') => match self.consume_next_char() {
Some(a2 @ 'A') => match self.consume_next_char() {
Some('[') => {
let mut data = String::with_capacity(7);

data.push('[');
data.push(c);
data.push(d);
data.push(a1);
data.push(t);
data.push(a2);
data.push('[');

self.cdata_raw = Some(data);
self.create_cdata_token();
self.append_to_cdata_token(None, Some('<'));
self.append_to_cdata_token(None, Some('!'));
self.append_to_cdata_token(None, Some('['));
self.append_to_cdata_token(None, Some(c));
self.append_to_cdata_token(None, Some(d));
self.append_to_cdata_token(None, Some(a1));
self.append_to_cdata_token(None, Some(t));
self.append_to_cdata_token(None, Some(a2));
self.append_to_cdata_token(None, Some('['));
self.state = State::Cdata;
}
_ => {
Expand Down Expand Up @@ -1734,11 +1767,11 @@ where
self.reconsume_in_state(State::Data);
}
// Anything else
// Emit the current input character as character token. Stay in the current
// Append the current input character to the cdata data. Stay in the current
// state.
Some(c) => {
self.validate_input_stream_character(c);
self.handle_raw_and_emit_character_token(c);
self.append_to_cdata_token(Some(c), Some(c));
}
}
}
Expand All @@ -1760,9 +1793,9 @@ where
// Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the
// CDATA section state.
Some(c) => {
self.emit_character_token((']', ']'));
self.emit_character_token((c, c));
self.reconsume_in_state(State::Cdata);
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(c), Some(c));
self.state = State::Cdata;
}
}
}
Expand All @@ -1772,13 +1805,17 @@ where
// U+003E GREATER-THAN SIGN (>)
// Switch to the data state.
Some('>') => {
self.append_to_cdata_token(None, Some(']'));
self.append_to_cdata_token(None, Some(']'));
self.append_to_cdata_token(None, Some('>'));
self.emit_cdata_token();
self.state = State::Data;
}
// U+005D RIGHT SQUARE BRACKET (])
// Emit the current input character as character token. Stay in the current
// state.
Some(c @ ']') => {
self.emit_character_token((c, c));
self.append_to_cdata_token(Some(c), Some(c));
}
// EOF
// Parse error. Reconsume the current input character in the data state.
Expand All @@ -1791,9 +1828,9 @@ where
// also emit the current input character as character token. Switch to the CDATA
// state.
Some(c) => {
self.emit_character_token((']', ']'));
self.emit_character_token((']', ']'));
self.emit_character_token((c, c));
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(c), Some(c));
self.state = State::Cdata;
}
}
Expand Down
45 changes: 45 additions & 0 deletions crates/swc_xml_parser/src/parser/mod.rs
Expand Up @@ -188,6 +188,11 @@ where
data,
})
}
Data::CdataSection { data, raw } => Child::CdataSection(CdataSection {
span: start_span,
data,
raw,
}),
_ => {
unreachable!();
}
Expand Down Expand Up @@ -267,6 +272,14 @@ where
Token::ProcessingInstruction { .. } => {
self.append_processing_instruction_to_doc(token_and_info)?;
}
Token::Cdata { .. } => {
self.errors.push(Error::new(
token_and_info.span,
ErrorKind::UnexpectedTokenInStartPhase,
));

self.append_cdata_to_doc(token_and_info)?;
}
Token::Character { value, .. } => {
if !is_whitespace(*value) {
self.errors.push(Error::new(
Expand Down Expand Up @@ -354,6 +367,11 @@ where

self.append_node(self.get_current_element(), processing_instruction);
}
Token::Cdata { .. } => {
let cdata = self.create_cdata_section(token_and_info);

self.append_node(self.get_current_element(), cdata);
}
Token::Eof => {
self.errors.push(Error::new(
token_and_info.span,
Expand All @@ -376,6 +394,14 @@ where
Token::ProcessingInstruction { .. } => {
self.append_processing_instruction_to_doc(token_and_info)?;
}
Token::Cdata { .. } => {
self.errors.push(Error::new(
token_and_info.span,
ErrorKind::UnexpectedTokenInEndPhase,
));

self.append_cdata_to_doc(token_and_info)?;
}
Token::Character { value, .. } => {
if !is_whitespace(*value) {
self.errors.push(Error::new(
Expand Down Expand Up @@ -603,6 +629,25 @@ where
Ok(())
}

/// Builds a CDATA section node from a `Token::Cdata`, using the token's span.
///
/// Callers must pass a CDATA token; any other token kind hits `unreachable!()`.
fn create_cdata_section(&self, token_and_info: &mut TokenAndInfo) -> RcNode {
    if let Token::Cdata { data, raw } = &token_and_info.token {
        let section = Data::CdataSection {
            data: data.clone(),
            raw: Some(raw.clone()),
        };

        return Node::new(section, token_and_info.span);
    }

    unreachable!()
}

/// Creates a CDATA section node from the token and appends it to the document node.
///
/// Panics if the document node has not been set.
fn append_cdata_to_doc(&mut self, token_and_info: &mut TokenAndInfo) -> PResult<()> {
    // Build the node first, then resolve the document, matching the original evaluation order.
    let cdata = self.create_cdata_section(token_and_info);
    let document = self.document.as_ref().unwrap();

    self.append_node(document, cdata);

    Ok(())
}

fn update_end_tag_span(&self, node: Option<&RcNode>, span: Span) {
if let Some(node) = node {
if node.start_span.borrow().is_dummy() {
Expand Down
4 changes: 4 additions & 0 deletions crates/swc_xml_parser/src/parser/node.rs
Expand Up @@ -36,6 +36,10 @@ pub enum Data {
target: JsWord,
data: JsWord,
},
CdataSection {
data: JsWord,
raw: Option<JsWord>,
},
Comment {
data: JsWord,
raw: Option<JsWord>,
Expand Down