Skip to content

Commit

Permalink
Support detecting doc/xls/ppt (#38)
Browse files Browse the repository at this point in the history
* Check docProps in msooxml matcher

* Allow clippy::upper-case-acronyms lint

* Fix typo: OOXLM -> OOXML

* Support detecting doc/xls/ppt
  • Loading branch information
messense committed Apr 12, 2021
1 parent fa860c4 commit 0d0dee6
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 36 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Expand Up @@ -14,10 +14,13 @@ exclude = ["testdata/*", "tests/*"]

[features]
default = ["std"]
std = ["alloc"]
std = ["alloc", "cfb"]
alloc = []

[[example]]
name = "file"
path = "examples/file.rs"
required-features = ["std"]

[dependencies]
cfb = { version = "0.4.0", optional = true }
1 change: 0 additions & 1 deletion README.md
Expand Up @@ -209,7 +209,6 @@ assert_eq!(kind.extension(), "foo");

## Known Issues

- `doc`, `ppt`, `xls`, `msi` all have the same magic number so it's not possible to tell which one just based on the binary data. `doc` is returned for all.
- `exe` and `dll` have the same magic number so it's not possible to tell which one just based on the binary data. `exe` is returned for all.

## License
Expand Down
70 changes: 37 additions & 33 deletions src/matchers/doc.rs
Expand Up @@ -3,26 +3,18 @@ use core::convert::TryInto;
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Eq, PartialEq)]
enum DocType {
// DOC,
DOC,
DOCX,
// XLS,
XLS,
XLSX,
// PPT,
PPT,
PPTX,
OOXLM,
OOXML,
}

/// Returns whether a buffer is Microsoft Word Document (DOC) data.
pub fn is_doc(buf: &[u8]) -> bool {
buf.len() > 7
&& buf[0] == 0xD0
&& buf[1] == 0xCF
&& buf[2] == 0x11
&& buf[3] == 0xE0
&& buf[4] == 0xA1
&& buf[5] == 0xB1
&& buf[6] == 0x1A
&& buf[7] == 0xE1
ole2(buf) == Some(DocType::DOC)
}

/// Returns whether a buffer is Microsoft Word Open XML Format Document (DOCX) data.
Expand All @@ -32,15 +24,7 @@ pub fn is_docx(buf: &[u8]) -> bool {

/// Returns whether a buffer is Microsoft Excel 97-2003 Worksheet (XLS) data.
pub fn is_xls(buf: &[u8]) -> bool {
buf.len() > 7
&& buf[0] == 0xD0
&& buf[1] == 0xCF
&& buf[2] == 0x11
&& buf[3] == 0xE0
&& buf[4] == 0xA1
&& buf[5] == 0xB1
&& buf[6] == 0x1A
&& buf[7] == 0xE1
ole2(buf) == Some(DocType::XLS)
}

/// Returns whether a buffer is Microsoft Excel Open XML Format Spreadsheet (XLSX) data.
Expand All @@ -50,15 +34,7 @@ pub fn is_xlsx(buf: &[u8]) -> bool {

/// Returns whether a buffer is Microsoft PowerPoint 97-2003 Presentation (PPT) data.
pub fn is_ppt(buf: &[u8]) -> bool {
buf.len() > 7
&& buf[0] == 0xD0
&& buf[1] == 0xCF
&& buf[2] == 0x11
&& buf[3] == 0xE0
&& buf[4] == 0xA1
&& buf[5] == 0xB1
&& buf[6] == 0x1A
&& buf[7] == 0xE1
ole2(buf) == Some(DocType::PPT)
}

/// Returns whether a buffer is Microsoft PowerPoint Open XML Presentation (PPTX) data.
Expand Down Expand Up @@ -108,15 +84,43 @@ fn msooxml(buf: &[u8]) -> Option<DocType> {
let idx = search(buf, start_offset, 6000);
match idx {
Some(idx) => start_offset += idx + 4 + 26,
None => return Some(DocType::OOXLM),
None => return Some(DocType::OOXML),
};

let typo = check_msooml(buf, start_offset);
if typo.is_some() {
return typo;
}

Some(DocType::OOXLM)
Some(DocType::OOXML)
}

/// Classifies a buffer that starts with the 8-byte compound-file signature by
/// inspecting the CLSID stored on the compound file's root entry.
///
/// Returns `None` when the signature is absent, when `cfb` fails to open the
/// buffer as a compound file, or when the root CLSID is not one of the values
/// matched below.
#[cfg(feature = "std")]
fn ole2(buf: &[u8]) -> Option<DocType> {
    const SIGNATURE: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];

    // Cheap prefix check first so non-matching buffers never reach the parser.
    if !compare_bytes(buf, &SIGNATURE, 0) {
        return None;
    }

    // A parse failure is not an error for the caller; it simply means "no match".
    let compound = cfb::CompoundFile::open(std::io::Cursor::new(buf)).ok()?;

    // The CLSID is compared in its textual form against the known identifiers.
    let root_clsid = compound.root_entry().clsid().to_string();
    match root_clsid.as_str() {
        "00020810-0000-0000-c000-000000000046" | "00020820-0000-0000-c000-000000000046" => {
            Some(DocType::XLS)
        }
        "00020906-0000-0000-c000-000000000046" => Some(DocType::DOC),
        "64818d10-4f9b-11cf-86ea-00aa00b929e8" => Some(DocType::PPT),
        _ => None,
    }
}

/// Signature-only variant compiled when the `std` feature is disabled.
///
/// Without the compound-file parser the container contents cannot be
/// inspected, so any buffer starting with the 8-byte signature is reported as
/// `DocType::DOC`; everything else yields `None`.
#[cfg(not(feature = "std"))]
fn ole2(buf: &[u8]) -> Option<DocType> {
    const SIGNATURE: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];

    if compare_bytes(buf, &SIGNATURE, 0) {
        Some(DocType::DOC)
    } else {
        None
    }
}

fn compare_bytes(slice: &[u8], sub_slice: &[u8], start_offset: usize) -> bool {
Expand Down
38 changes: 37 additions & 1 deletion tests/doc.rs
@@ -1,6 +1,30 @@
mod common;

test_format!(DOC, "application/msword", "doc", doc, "sample.doc");
// Generates a module named `$module` holding a single `get` test.
//
// The test embeds `../testdata/$fixture` at compile time, runs `infer::get`
// on it, and asserts that the detected type equals a `Type` built from the
// given matcher type, MIME string and extension.
//
// Arguments, in order: matcher type variant, expected MIME type, expected
// extension, module name, fixture file name.
#[cfg(feature = "std")]
macro_rules! test_format_get_only {
    ($matcher_type:ident, $mime:expr, $ext:expr, $module:ident, $fixture:expr) => {
        mod $module {
            use infer::{MatcherType, Type};

            // `Type::new` takes a matcher function; this stub only fills that slot.
            fn never_matches(_buf: &[u8]) -> bool {
                false
            }

            #[test]
            fn get() {
                let buf = include_bytes!(concat!("../testdata/", $fixture));
                let expected_kind =
                    Type::new(MatcherType::$matcher_type, $mime, $ext, never_matches);

                let kind = infer::get(buf).expect("test file matches");

                assert_eq!(expected_kind, kind);
            }
        }
    };
}

#[cfg(feature = "std")]
test_format_get_only!(DOC, "application/msword", "doc", doc, "sample.doc");

test_format!(
DOC,
Expand All @@ -10,6 +34,9 @@ test_format!(
"sample.docx"
);

#[cfg(feature = "std")]
test_format_get_only!(DOC, "application/vnd.ms-excel", "xls", xls, "sample.xls");

test_format!(
DOC,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
Expand All @@ -18,6 +45,15 @@ test_format!(
"sample.xlsx"
);

#[cfg(feature = "std")]
test_format_get_only!(
DOC,
"application/vnd.ms-powerpoint",
"ppt",
ppt,
"sample.ppt"
);

test_format!(
DOC,
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
Expand Down

0 comments on commit 0d0dee6

Please sign in to comment.