Skip to content

Commit

Permalink
TIKA-4249 -- allow a UTF-8 BOM at the start of rfc822 detection (#1739)
Browse files Browse the repository at this point in the history
(cherry picked from commit 9f8a2f5)
  • Loading branch information
tballison committed May 1, 2024
1 parent b419cf5 commit b0f3be8
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6929,6 +6929,7 @@
<magic priority="45">
<!-- be a bit more flexible, but require one from each of these -->
<match minShouldMatch="2">

<match minShouldMatch="1">
<match value="Content-ID:" type="stringignorecase" offset="0"/>
<match value="Content-Location:" type="stringignorecase" offset="0"/>
Expand All @@ -6949,6 +6950,27 @@
<match value="User-Agent:" type="string" offset="0"/>
<match value="X-Mailer:" type="string" offset="0"/>
<match value="X-Originating-IP:" type="stringignorecase" offset="0"/>
<!-- Same header names as the offset="0" entries above, but for content that
     starts with the three-byte UTF-8 byte order mark (EF BB BF): the outer
     match looks for the BOM at offset 0 and each nested match looks for a
     header name at offset 3, i.e. immediately after the BOM.
     NOTE(review): keep this list in sync with the offset="0" list above. -->
<match value="0xefbbbf" type="string" offset="0">
<match value="Content-ID:" type="stringignorecase" offset="3"/>
<match value="Content-Location:" type="stringignorecase" offset="3"/>
<match value="Content-Transfer-Encoding:" type="stringignorecase" offset="3"/>
<match value="Content-Type:" type="stringignorecase" offset="3"/>
<match value="Date:" type="stringignorecase" offset="3"/>
<match value="Delivered-To:" type="string" offset="3"/>
<match value="From:" type="stringignorecase" offset="3"/>
<match value="Message-ID:" type="stringignorecase" offset="3"/>
<match value="MIME-Version:" type="stringignorecase" offset="3"/>
<match value="Received:" type="stringignorecase" offset="3"/>
<match value="Relay-Version:" type="stringignorecase" offset="3"/>
<match value="Return-Path:" type="stringignorecase" offset="3"/>
<match value="Sent:" type="string" offset="3"/>
<match value="Status:" type="string" offset="3"/>
<match value="Subject:" type="string" offset="3"/>
<match value="To:" type="string" offset="3"/>
<match value="User-Agent:" type="string" offset="3"/>
<match value="X-Mailer:" type="string" offset="3"/>
<match value="X-Originating-IP:" type="stringignorecase" offset="3"/>
</match>
</match>
<match minShouldMatch="1">
<match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.io.InputStream;
import java.net.URL;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

Expand Down Expand Up @@ -108,6 +110,27 @@ public void testByteOrderMark() throws Exception {
.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata()));
}

/**
 * Verifies that a block of RFC 822 style headers is detected as
 * {@code message/rfc822} both as plain bytes and when the same bytes are
 * preceded by a UTF-8 byte order mark.
 *
 * @throws Exception if detection fails while reading the in-memory stream
 */
@Test
public void testRFC822WithBOM() throws Exception {
    String header = "From: blah <blah@blah.com>\r\n" + "Received: Friday, January 24, 2020 3:24 PM\r\n" +
            "To: someone@somewhere.com\r\n" + "Cc: someone-else@other.com\r\n" +
            "Subject: Received\r\n";
    MediaType rfc822 = MediaType.parse("message/rfc822");

    // Encode the header once and reuse the bytes for both inputs.
    byte[] headerBytes = header.getBytes(UTF_8);

    // Baseline: headers with no BOM must already be recognised.
    assertEquals(rfc822, MIME_TYPES.detect(UnsynchronizedByteArrayInputStream
            .builder()
            .setByteArray(headerBytes)
            .get(), new Metadata()));

    // Build BOM + headers; the header's destination offset is derived from
    // the BOM length rather than a hard-coded 3.
    byte[] bom = ByteOrderMark.UTF_8.getBytes();
    byte[] bytes = new byte[bom.length + headerBytes.length];
    System.arraycopy(bom, 0, bytes, 0, bom.length);
    System.arraycopy(headerBytes, 0, bytes, bom.length, headerBytes.length);

    assertEquals(rfc822, MIME_TYPES.detect(UnsynchronizedByteArrayInputStream
            .builder()
            .setByteArray(bytes)
            .get(), new Metadata()));
}

@Test
public void testSuperTypes() {
assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
Expand Down

0 comments on commit b0f3be8

Please sign in to comment.