Skip to content

Commit

Permalink
feat(xml/parser): CDataSection parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-akait committed Nov 28, 2022
1 parent 9785bff commit 1ad8a07
Show file tree
Hide file tree
Showing 24 changed files with 829 additions and 234 deletions.
8 changes: 4 additions & 4 deletions crates/swc_xml_ast/src/base.rs
Expand Up @@ -29,8 +29,8 @@ pub enum Child {
Element(Element),
#[tag("Text")]
Text(Text),
#[tag("CDATASection")]
CDATASection(CDATASection),
#[tag("CdataSection")]
CdataSection(CdataSection),
#[tag("Comment")]
Comment(Comment),
#[tag("ProcessingInstruction")]
Expand Down Expand Up @@ -92,9 +92,9 @@ pub struct Text {
pub raw: Option<JsWord>,
}

#[ast_node("CDATASection")]
#[ast_node("CdataSection")]
#[derive(Eq, Hash, EqIgnoreSpan)]
pub struct CDATASection {
pub struct CdataSection {
pub span: Span,
pub data: JsWord,
pub raw: Option<JsWord>,
Expand Down
2 changes: 1 addition & 1 deletion crates/swc_xml_ast/src/token.rs
Expand Up @@ -60,7 +60,7 @@ pub enum Token {
target: JsWord,
data: JsWord,
},
CData {
Cdata {
data: JsWord,
raw: JsWord,
},
Expand Down
12 changes: 12 additions & 0 deletions crates/swc_xml_codegen/src/lib.rs
Expand Up @@ -62,6 +62,7 @@ where
Child::Text(n) => emit!(self, n),
Child::Comment(n) => emit!(self, n),
Child::ProcessingInstruction(n) => emit!(self, n),
Child::CdataSection(n) => emit!(self, n),
}
}

Expand Down Expand Up @@ -289,6 +290,17 @@ where
newline!(self);
}

// Serializes a CDATA section node as `<![CDATA[` + data + `]]>` and writes it
// out as raw multiline text at the node's span.
#[emitter]
fn emit_cdata_section(&mut self, n: &CdataSection) -> Result {
    // The 12 extra bytes hold the "<![CDATA[" opener (9) and "]]>" closer (3).
    let mut serialized = String::with_capacity(n.data.len() + 12);

    for part in ["<![CDATA[", &*n.data, "]]>"] {
        serialized.push_str(part);
    }

    write_multiline_raw!(self, n.span, &serialized);
}

fn create_context_for_element(&self, n: &Element) -> Ctx {
let need_escape_text = match &*n.tag_name {
"noscript" => !self.config.scripting_enabled,
Expand Down
25 changes: 25 additions & 0 deletions crates/swc_xml_codegen/tests/fixture/cdata_section/input.xml
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
25 changes: 25 additions & 0 deletions crates/swc_xml_codegen/tests/fixture/cdata_section/output.min.xml
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
25 changes: 25 additions & 0 deletions crates/swc_xml_codegen/tests/fixture/cdata_section/output.xml
@@ -0,0 +1,25 @@
<root>
<description>An example of escaped CENDs</description>
<!-- This text contains a CEND ]]> -->
<!-- In this first case we put the ]] at the end of the first CDATA block
and the > in the second CDATA block -->
<exampleOfACDATA>
<![CDATA[
Since this is a CDATA section
I can use all sorts of reserved characters
but my document is still well formed!
]]>
</exampleOfACDATA>
<p><![CDATA[<greeting>Hello, world!</greeting>]]></p>
<p><![CDATA[content]]></p>
<p><![CDATA[&amping]]></p>
<p><![CDATA[&amping ]]]></p>
<p><![CDATA[&amping]] ]]></p>
<p><![CDATA[<message>text</message>]]></p>
<p><![CDATA[</this is malformed!</malformed</malformed & worse>]]></p>
<p><![CDATA[1]]><![CDATA[2]]></p>
<p>
<![CDATA[data]]> </p>
<p><![CDATA[bracket ]after]]></p>
<p><![CDATA[]]></p>
</root>
79 changes: 58 additions & 21 deletions crates/swc_xml_parser/src/lexer/mod.rs
Expand Up @@ -110,6 +110,12 @@ struct ProcessingInstruction {
data: String,
}

/// Lexer-internal buffer for a CDATA section that is still being tokenized.
///
/// It is filled character by character and later converted into a
/// `Token::Cdata` when the section is emitted.
#[derive(PartialEq, Eq, Clone, Debug)]
struct Cdata {
/// Characters collected as the section's content.
data: String,
/// Characters collected as the section's source text; the lexer also pushes
/// the `<![CDATA[` opener and the `]]>` closer here.
raw: String,
}

pub(crate) type LexResult<T> = Result<T, ErrorKind>;

pub struct Lexer<I>
Expand All @@ -127,11 +133,11 @@ where
additional_allowed_character: Option<char>,
pending_tokens: VecDeque<TokenAndSpan>,
doctype_raw: Option<String>,
cdata_raw: Option<String>,
current_doctype_token: Option<Doctype>,
current_comment_token: Option<Comment>,
current_processing_instruction: Option<ProcessingInstruction>,
current_tag_token: Option<Tag>,
current_cdata_token: Option<Cdata>,
attribute_start_position: Option<BytePos>,
}

Expand All @@ -154,11 +160,11 @@ where
additional_allowed_character: None,
pending_tokens: VecDeque::new(),
doctype_raw: None,
cdata_raw: None,
current_doctype_token: None,
current_comment_token: None,
current_processing_instruction: None,
current_tag_token: None,
current_cdata_token: None,
attribute_start_position: None,
};

Expand Down Expand Up @@ -863,6 +869,34 @@ where
});
}

/// Starts a new, empty CDATA token and stores it as the current one,
/// replacing whatever was there before.
fn create_cdata_token(&mut self) {
    self.current_cdata_token = Some(Cdata {
        data: String::new(),
        // Pre-size `raw` for the 12 delimiter characters of `<![CDATA[` and `]]>`.
        raw: String::with_capacity(12),
    });
}

fn append_to_cdata_token(&mut self, c: Option<char>, raw_c: Option<char>) {
if let Some(Cdata { data, raw }) = &mut self.current_cdata_token {
if let Some(c) = c {
data.push(c);
}

if let Some(raw_c) = raw_c {
raw.push(raw_c);
}
}
}

fn emit_cdata_token(&mut self) {
let cdata = self.current_cdata_token.take().unwrap();

self.emit_token(Token::Cdata {
data: cdata.data.into(),
raw: cdata.raw.into(),
});
}

fn handle_raw_and_emit_character_token(&mut self, c: char) {
let is_cr = c == '\r';

Expand Down Expand Up @@ -1364,17 +1398,16 @@ where
Some(t @ 'T') => match self.consume_next_char() {
Some(a2 @ 'A') => match self.consume_next_char() {
Some('[') => {
let mut data = String::with_capacity(7);

data.push('[');
data.push(c);
data.push(d);
data.push(a1);
data.push(t);
data.push(a2);
data.push('[');

self.cdata_raw = Some(data);
self.create_cdata_token();
self.append_to_cdata_token(None, Some('<'));
self.append_to_cdata_token(None, Some('!'));
self.append_to_cdata_token(None, Some('['));
self.append_to_cdata_token(None, Some(c));
self.append_to_cdata_token(None, Some(d));
self.append_to_cdata_token(None, Some(a1));
self.append_to_cdata_token(None, Some(t));
self.append_to_cdata_token(None, Some(a2));
self.append_to_cdata_token(None, Some('['));
self.state = State::Cdata;
}
_ => {
Expand Down Expand Up @@ -1697,11 +1730,11 @@ where
self.reconsume_in_state(State::Data);
}
// Anything else
// Emit the current input character as character token. Stay in the current
// Append the current input character to the cdata data. Stay in the current
// state.
Some(c) => {
self.validate_input_stream_character(c);
self.handle_raw_and_emit_character_token(c);
self.append_to_cdata_token(Some(c), Some(c));
}
}
}
Expand All @@ -1723,8 +1756,8 @@ where
// Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the
// CDATA section state.
Some(c) => {
self.emit_character_token((']', ']'));
self.emit_character_token((c, c));
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(c), Some(c));
self.state = State::Cdata;
}
}
Expand All @@ -1735,13 +1768,17 @@ where
// U+003E GREATER-THAN SIGN (>)
// Switch to the data state.
Some('>') => {
self.append_to_cdata_token(None, Some(']'));
self.append_to_cdata_token(None, Some(']'));
self.append_to_cdata_token(None, Some('>'));
self.emit_cdata_token();
self.state = State::Data;
}
// U+005D RIGHT SQUARE BRACKET (])
// Append the current input character to the cdata data. Stay in the current
// state.
Some(c @ ']') => {
self.emit_character_token((c, c));
self.append_to_cdata_token(Some(c), Some(c));
}
// EOF
// Parse error. Reconsume the current input character in the data state.
Expand All @@ -1754,9 +1791,9 @@ where
// also emit the current input character as character token. Switch to the CDATA
// state.
Some(c) => {
self.emit_character_token((']', ']'));
self.emit_character_token((']', ']'));
self.emit_character_token((c, c));
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(']'), Some(']'));
self.append_to_cdata_token(Some(c), Some(c));
self.state = State::Cdata;
}
}
Expand Down
45 changes: 45 additions & 0 deletions crates/swc_xml_parser/src/parser/mod.rs
Expand Up @@ -188,6 +188,11 @@ where
data,
})
}
Data::CdataSection { data, raw } => Child::CdataSection(CdataSection {
span: start_span,
data,
raw,
}),
_ => {
unreachable!();
}
Expand Down Expand Up @@ -267,6 +272,14 @@ where
Token::ProcessingInstruction { .. } => {
self.append_processing_instruction_to_doc(token_and_info)?;
}
Token::Cdata { .. } => {
self.errors.push(Error::new(
token_and_info.span,
ErrorKind::UnexpectedTokenInStartPhase,
));

self.append_cdata_to_doc(token_and_info)?;
}
Token::Character { value, .. } => {
if !is_whitespace(*value) {
self.errors.push(Error::new(
Expand Down Expand Up @@ -354,6 +367,11 @@ where

self.append_node(self.get_current_element(), processing_instruction);
}
Token::Cdata { .. } => {
let cdata = self.create_cdata_section(token_and_info);

self.append_node(self.get_current_element(), cdata);
}
Token::Eof => {
self.errors.push(Error::new(
token_and_info.span,
Expand All @@ -376,6 +394,14 @@ where
Token::ProcessingInstruction { .. } => {
self.append_processing_instruction_to_doc(token_and_info)?;
}
Token::Cdata { .. } => {
self.errors.push(Error::new(
token_and_info.span,
ErrorKind::UnexpectedTokenInEndPhase,
));

self.append_cdata_to_doc(token_and_info)?;
}
Token::Character { value, .. } => {
if !is_whitespace(*value) {
self.errors.push(Error::new(
Expand Down Expand Up @@ -603,6 +629,25 @@ where
Ok(())
}

/// Builds a CDATA section node from a `Token::Cdata`, cloning the token's
/// `data` and `raw` and reusing the token's span.
///
/// Any other token kind hits `unreachable!()`.
fn create_cdata_section(&self, token_and_info: &mut TokenAndInfo) -> RcNode {
    if let Token::Cdata { data, raw } = &token_and_info.token {
        let section = Data::CdataSection {
            data: data.clone(),
            raw: Some(raw.clone()),
        };

        return Node::new(section, token_and_info.span);
    }

    unreachable!()
}

/// Creates a CDATA section node from the token and appends it directly to
/// the document node.
///
/// # Panics
///
/// Panics if the document node has not been set.
fn append_cdata_to_doc(&mut self, token_and_info: &mut TokenAndInfo) -> PResult<()> {
    // Build the node first, then look up the document, as the original did.
    let cdata_node = self.create_cdata_section(token_and_info);
    let document = self.document.as_ref().unwrap();

    self.append_node(document, cdata_node);

    Ok(())
}

fn update_end_tag_span(&self, node: Option<&RcNode>, span: Span) {
if let Some(node) = node {
if node.start_span.borrow().is_dummy() {
Expand Down
4 changes: 4 additions & 0 deletions crates/swc_xml_parser/src/parser/node.rs
Expand Up @@ -36,6 +36,10 @@ pub enum Data {
target: JsWord,
data: JsWord,
},
CdataSection {
data: JsWord,
raw: Option<JsWord>,
},
Comment {
data: JsWord,
raw: Option<JsWord>,
Expand Down

0 comments on commit 1ad8a07

Please sign in to comment.