Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement differential fuzzer for pandoc #673

Draft
wants to merge 26 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
370ac6b
Initial math spec
ollpu Oct 16, 2023
e10c5de
Yet another Math implementation
notriddle Oct 19, 2023
e2bcc97
Clean up some minor nits from code review
notriddle Oct 21, 2023
b2cf23d
Use a better parsing strategy for display
notriddle Oct 21, 2023
ba69c64
Set "reasonable" limits for brace nesting
notriddle Oct 21, 2023
552e0d1
Clean up (and add another test) for `$$x$x$$`
notriddle Oct 22, 2023
d47ce0c
Stop clear()ing the map after successful matches
notriddle Oct 22, 2023
8a503c9
Use convenience function
notriddle Oct 25, 2023
747d923
Clean up indentation
notriddle Oct 25, 2023
46f5da4
Adapt tests for html5ever normalization being removed
notriddle Mar 4, 2024
ded8707
Fix parsing power bug where math doesn't block lists
notriddle Mar 4, 2024
39fbe58
Avoid incorrect brace matching on `$inline$$`-shaped problems
notriddle Mar 4, 2024
e771f5d
Align with the current version of commonmark-hs
notriddle Mar 5, 2024
bfeadd5
Avoid some incorrect treatment of display math lookups
notriddle Mar 5, 2024
80fd33f
Improve brace overflow heuristics
notriddle Mar 5, 2024
c405ef4
Fix bugs in handling of block structure
notriddle Mar 5, 2024
2cacc65
Add `math-` to HTML classes `inline` and `display`
notriddle Mar 11, 2024
875cf51
docs: minor docstring error in display math variant
Martin1887 Mar 23, 2024
13ddab0
chore: minor comment fix
Martin1887 Mar 23, 2024
e91b65c
fix(clippy): <= 0 comparison in a usize
Martin1887 Mar 23, 2024
a278054
Fix headers in math.txt
notriddle Mar 23, 2024
b547d00
Keep populating math_delims after invalid delim
ollpu Mar 25, 2024
48aed10
Math refactor
ollpu Mar 25, 2024
3d8bb1f
Disambiguate regression test
ollpu Mar 25, 2024
a59f6ea
Pandoc fuzzer
notriddle Mar 5, 2024
4e02fac
Update lockfile
notriddle Apr 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
852 changes: 848 additions & 4 deletions fuzz/Cargo.lock

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pretty_assertions = "1.3.0"
quick-xml = "0.29"
mozjs = { git = "https://github.com/notriddle/mozjs", features = ["streams"] }
urlencoding = "2.1.2"
serde_json = "1.0.114"
serde = { version = "1", features = ["derive"] }
reqwest = { version = "0.11.24", features = ["blocking", "json"] }

[dependencies.pulldown-cmark]
path = "../pulldown-cmark"
Expand All @@ -35,3 +38,9 @@ name = "commonmark_js"
path = "fuzz_targets/commonmark_js.rs"
test = false
doc = false

[[bin]]
name = "pandoc"
path = "fuzz_targets/pandoc.rs"
test = false
doc = false
222 changes: 222 additions & 0 deletions fuzz/fuzz_targets/pandoc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
#![no_main]

//! Differential fuzzing of pulldown-cmark and pandoc commonmark.
//!
//! This fuzzer sends the same input to both Markdown parsers and
//! compares the output. Pandoc's JSON AST is turned into
//! `pulldown_cmark::Event` values for this purpose.
//!
//! Run the fuzzer like this to only test ASCII input (which is
//! usually enough to find parsing differences):
//!
//! cargo fuzz run pandoc -- -only_ascii=1

use libfuzzer_sys::fuzz_target;
use pretty_assertions::assert_eq;
use pulldown_cmark_fuzz::{PandocHandle, normalize_pandoc, print_events, pulldown_cmark_ext, xml_to_events, commonmark_js};
use std::sync::OnceLock;

/// Process-wide pandoc handle. The fuzz target fills it on first use via
/// `get_or_init(PandocHandle::new)` and reuses the same handle afterwards.
static PANDOC: OnceLock<PandocHandle> = OnceLock::new();

// Fuzz entry point. For each generated `text`: bail out early on inputs in
// categories this fuzzer deliberately skips, parse with pulldown-cmark and
// pandoc, normalize both event streams, and on any mismatch dump diagnostics
// to stderr before failing the final `assert_eq!`.
fuzz_target!(|text: String| {
// Skip input containing any ASCII control byte other than `\n`.
// (Author's note: there are some differences in handling of non-UTF-8 input.)
if text.bytes().any(|b| b.is_ascii_control() && b != b'\n') {
return;
}

// https://github.com/jgm/commonmark-hs/issues/149
if text.contains(r"\&") {
return;
}
// Skip brackets with adjacent whitespace: `[^` or `[` followed by
// whitespace, and whitespace followed by `]`.
// NOTE(review): `\t` and `\r` are ASCII control bytes, so the first guard in
// this closure has already returned for any input containing them; the
// `\t` / `\r` variants below look unreachable -- confirm before removing.
if text.contains("[^ ") {
return;
}
if text.contains("[^\t") {
return;
}
if text.contains("[^\n") {
return;
}
if text.contains("[^\r") {
return;
}
if text.contains(" ]") {
return;
}
if text.contains("\t]") {
return;
}
if text.contains("\n]") {
return;
}
if text.contains("\r]") {
return;
}
if text.contains("[ ") {
return;
}
if text.contains("[\t") {
return;
}
if text.contains("[\n") {
return;
}
if text.contains("[\r") {
return;
}
// Skip any input containing `<` or a backtick.
// https://github.com/jgm/commonmark-hs/issues/136
if text.contains("<") || text.contains("`") {
return;
}

// Skip any input containing a non-ASCII byte.
if text.bytes().any(|b| b > 127) {
return;
}

// There are some trivial differences due to trailing whitespace.
// Rebuild the text with every line right-trimmed, lines joined by `\n`,
// and a final `\n` appended. Both parsers receive this rebuilt text.
let mut text = text
.lines()
.map(|line| line.trim_end())
.collect::<Vec<_>>()
.join("\n");
text.push('\n');

let pulldown_cmark_events = pulldown_cmark_ext(&text);

// Make sure there aren't cyclical footnotes.
// pulldown-cmark supports them, but pandoc does not
// and making it work would require completely redesigning
// the pandoc ast.
//
// commonmark-hs also trims footnote reference names.
// I'm not going to bother reporting it as a bug, since it's
// obviously on purpose and it's not gonna matter.
//
// First pass (raw pulldown-cmark events): `footstack` holds the trimmed ids
// of the footnote definitions that are currently open.
let mut footstack = vec![];
for event in &pulldown_cmark_events {
use pulldown_cmark::{Event, Tag, TagEnd};
match event {
Event::Start(Tag::FootnoteDefinition(id)) => {
// Skip inputs whose footnote definition id starts or ends with whitespace
// or contains one of the listed whitespace sequences.
if id.starts_with("\n") || id.ends_with("\n") || id.starts_with("\r") || id.ends_with("\r") || id.starts_with(" ") || id.starts_with("\t") || id.contains(" ") || id.contains("\t ") || id.contains(" \t") || id.contains("\t\t") || id.ends_with(" ") || id.ends_with("\t") { return };
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps it would be simpler to use the slice variants of the starts_with and ends_with patterns:

Suggested change
if id.starts_with("\n") || id.ends_with("\n") || id.starts_with("\r") || id.ends_with("\r") || id.starts_with(" ") || id.starts_with("\t") || id.contains(" ") || id.contains("\t ") || id.contains(" \t") || id.contains("\t\t") || id.ends_with(" ") || id.ends_with("\t") { return };
let whitespace = &['\n', '\r', ' ', '\t'];
if id.starts_with(whitespace)
|| id.ends_with(whitespace)
|| id.contains("\t ")
|| id.contains(" \t")
|| id.contains("\t\t")
{
return;
};

footstack.push(id.trim().to_string());
}
Event::End(TagEnd::FootnoteDefinition) => {
footstack.pop();
}
Event::FootnoteReference(id) => {
// Bail out on ANY footnote reference seen while a footnote definition is
// open. The commented-out line below is a narrower check (only references
// to an enclosing definition) that is currently disabled.
if !footstack.is_empty() {
//if footstack.contains(&id.trim().to_string()) {
return;
}
}
_ => {}
}
}
// Shadow the raw events with their normalized form; pandoc's events go
// through the same `normalize_pandoc` call before the comparison below.
let pulldown_cmark_events = normalize_pandoc(pulldown_cmark_events);
// Second pass (normalized events): `footstack` again tracks open footnote
// definitions (untrimmed ids this time) and `liststack` counts the list
// items that are currently open.
let mut footstack = vec![];
let mut liststack = 0;
for event in &pulldown_cmark_events {
use pulldown_cmark::{Event, Tag, TagEnd};
match event {
Event::Start(Tag::CodeBlock(..)) => {
// differences in list tightness
// https://github.com/jgm/commonmark-hs/issues/144
// Skip any input that has a code block inside a list item.
if liststack != 0 {
return;
}
}
Event::Start(Tag::Item) => {
// differences in list tightness
liststack += 1;
}
Event::End(TagEnd::Item) => {
// differences in list tightness
liststack -= 1;
}
Event::Start(Tag::FootnoteDefinition(id)) => {
footstack.push(id.to_string());
}
Event::End(TagEnd::FootnoteDefinition) => {
footstack.pop();
}
Event::FootnoteReference(id) => {
// Skip inputs where a reference names a definition that is still open,
// i.e. a footnote that refers to itself or to an enclosing footnote.
if footstack.contains(&id.to_string()) {
return;
}
}
_ => {}
}
}

// Create the shared pandoc handle on first use and reuse it afterwards.
let pandoc = PANDOC.get_or_init(PandocHandle::new);

// Ask pandoc for its AST. On failure, print the pulldown-cmark events for
// context; errors whose message mentions "recursion limit" are tolerated
// (the input is skipped), anything else aborts the run with a panic.
let pandoc_ast = match pandoc.get_ast(&text) {
Ok(pandoc_ast) => pandoc_ast,
Err(err) => {
print_events(&text, &pulldown_cmark_events);
if err.to_string().contains("recursion limit") {
return;
} else {
panic!("Could not get Pandoc JSON: {}", err);
}
}
};

// Convert the pandoc AST to events. A conversion failure is treated as
// fatal: both the pulldown-cmark events and the AST are printed first.
let raw_events = match pandoc_ast.to_events() {
Ok(raw_events) => raw_events,
Err(err) => {
print_events(&text, &pulldown_cmark_events);
eprintln!("AST from pandoc:\n{pandoc_ast:#?}");
panic!("Could not convert Pandoc AST: {}", err);
}
};

let pandoc_events = normalize_pandoc(raw_events);
// On a mismatch, write a report to stderr (fenced code blocks for each event
// stream) before failing.
if pulldown_cmark_events != pandoc_events {
eprintln!("Events from pulldown-cmark:\n\n```rust");
print_events(&text, &pulldown_cmark_events);
eprintln!("```");
eprintln!();

eprintln!("Events from pandoc:\n\n```rust");
print_events(&text, &pandoc_events);
eprintln!("```");
eprintln!();

// For completeness's sake, also include commonmark.js
// It's not used in this fuzzer, but it's convenient to tell if it's
// just Pandoc being weird.
// NOTE(review): the `unwrap` panics if `commonmark_js` does not yield a
// value, which would end the report before the final assertion runs.
let commonmark_js_xml = &commonmark_js(&text).unwrap();

// A failure to convert the commonmark.js XML is ignored on purpose: this
// output is informational only.
match xml_to_events(commonmark_js_xml) {
Ok(raw_events_cmjs) => {
let commonmark_js_events = normalize_pandoc(raw_events_cmjs);
eprintln!("Events from commonmark.js:\n\n```rust");
print_events(&text, &commonmark_js_events);
eprintln!("```");
eprintln!();
},
Err(..) => {}
};

// Build a https://pandoc.org/try/ URL that carries the input text and the
// conversion settings (commonmark -> json) as URL-encoded JSON in the
// `params` query parameter, then print it as a Markdown link alongside the
// pandoc AST.
#[derive(serde::Serialize)]
struct DingusParams<'a> {
text: &'a str,
to: &'a str,
from: &'a str,
}
let dingus_params = serde_json::to_string(&DingusParams {
text: &text,
to: "json",
from: "commonmark",
}).unwrap();

let dingus_url = format!(
"https://pandoc.org/try/?params={}",
urlencoding::encode(&dingus_params)
);
eprintln!("AST from [pandoc]({dingus_url}):\n\n```text\n{pandoc_ast:#?}\n```");
eprintln!();

// The two streams compared unequal above, so this assertion is expected to
// fail; it is what reports the mismatch to the fuzzer as a crash.
assert_eq!(pulldown_cmark_events, pandoc_events);
}
});