Skip to content

Commit

Permalink
TIKA-4248 -- improve handling of attachments in PST (#1738)
Browse files Browse the repository at this point in the history
* TIKA-4248 -- improve PST handling of embedded files.
  • Loading branch information
tballison committed Apr 30, 2024
1 parent 63b7e91 commit de282d2
Show file tree
Hide file tree
Showing 8 changed files with 368 additions and 237 deletions.
7 changes: 7 additions & 0 deletions CHANGES.txt
@@ -1,3 +1,10 @@
Release 3.0.0-BETA2 - ???

BREAKING CHANGES

* Updated PST parser to use standard Message metadata keys and improved
handling of embedded files (TIKA-4248).

Release 3.0.0-BETA - 12/01/2023

BREAKING CHANGES
Expand Down
8 changes: 8 additions & 0 deletions tika-core/src/main/java/org/apache/tika/metadata/Office.java
Expand Up @@ -176,4 +176,12 @@ public interface Office {
Property PROG_ID = Property.internalText("msoffice:progID");

Property OCX_NAME = Property.internalText("msoffice:ocxName");
/**
 * Text property stored under the document-meta prefix with the local name
 * "mapi-recipients-string".
 */
Property MAPI_RECIPIENTS_STRING = Property.internalText(PREFIX_DOC_META +
        TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-recipients-string");

/**
 * Integer property stored under the document-meta prefix with the local name
 * "mapi-importance".
 */
Property MAPI_IMPORTANCE = Property.internalInteger(PREFIX_DOC_META +
        TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");

/**
 * Integer property stored under the document-meta prefix with the local name
 * "mapi-priority".
 * <p>
 * This key used to be "mapi-importance", which made it collide with
 * {@link #MAPI_IMPORTANCE}; it now has its own key. The misspelled constant
 * name is kept so that existing references continue to compile.
 */
Property MAPI_PRIORTY = Property.internalInteger(PREFIX_DOC_META +
        TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-priority");

/**
 * Boolean property stored under the document-meta prefix with the local name
 * "mapi-is-flagged".
 */
Property MAPI_IS_FLAGGED = Property.internalBoolean(PREFIX_DOC_META +
        TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged");
}
26 changes: 26 additions & 0 deletions tika-core/src/main/java/org/apache/tika/metadata/PST.java
@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.tika.metadata;

/**
 * Metadata keys specific to PST files. All keys share the {@link #PST_PREFIX}
 * namespace prefix.
 */
public interface PST {

    /** Namespace prefix prepended to every key declared in this interface. */
    String PST_PREFIX = "pst:";

    /** Text property holding the folder path of an item inside the PST file. */
    Property PST_FOLDER_PATH = Property.internalText(PST_PREFIX + "folderPath");

    /**
     * Text property holding an item's descriptor node id.
     * The key was previously misspelled as "discriptorNodeId".
     */
    Property DESCRIPTOR_NODE_ID = Property.internalText(PST_PREFIX + "descriptorNodeId");

    /** Boolean property recording whether the PST file was reported as valid. */
    Property IS_VALID = Property.internalBoolean(PST_PREFIX + "isValid");
}
Expand Up @@ -139,7 +139,25 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
} finally {
xhtml.endDocument();
}
}

/**
 * Parses HTML that is already available as a decoded string and streams the
 * result to {@code handler} as XHTML SAX events.
 *
 * @param html     the HTML markup to parse
 * @param handler  receiver of the generated SAX events
 * @param metadata metadata object passed through to the HTML handler
 * @param context  parse context; consulted for an {@link HtmlMapper}, with a
 *                 {@link DefaultHtmlMapper} used when none is registered
 * @throws SAXException if the downstream handler fails; an exception wrapped
 *                      in a {@link RuntimeSAXException} during traversal is
 *                      unwrapped and rethrown
 */
public void parseString(String html, ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException {
    // Resolve the mapper first: caller-supplied if present, default otherwise.
    final HtmlMapper htmlMapper = context.get(HtmlMapper.class, new DefaultHtmlMapper());

    // No base URI is supplied to jsoup here -- do better with baseUri?
    final Document parsed = Jsoup.parse(html);
    parsed.quirksMode(Document.QuirksMode.quirks);

    final HtmlHandler htmlHandler =
            new HtmlHandler(htmlMapper, handler, metadata, context, extractScripts);
    final ContentHandler downgraded = new XHTMLDowngradeHandler(htmlHandler);

    downgraded.startDocument();
    try {
        NodeTraversor.filter(new TikaNodeFilter(downgraded), parsed);
    } catch (RuntimeSAXException wrapper) {
        // The node filter cannot throw checked exceptions; surface the real one.
        throw wrapper.getWrapped();
    } finally {
        // Always close the document, even when traversal fails.
        downgraded.endDocument();
    }
}

private class TikaNodeFilter implements NodeFilter {
Expand Down
Expand Up @@ -17,20 +17,15 @@
package org.apache.tika.parser.microsoft.pst;

import static java.lang.String.valueOf;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Collections.singleton;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;

import com.pff.PSTAttachment;
import com.pff.PSTException;
import com.pff.PSTFile;
import com.pff.PSTFolder;
import com.pff.PSTMessage;
import com.pff.PSTRecipient;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
Expand All @@ -39,14 +34,12 @@
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.PST;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OutlookExtractor;
import org.apache.tika.sax.XHTMLContentHandler;

/**
Expand Down Expand Up @@ -88,13 +81,13 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
pstFile = new PSTFile(in.getFile().getPath());
metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
boolean isValid = pstFile.getFileHandle().getFD().valid();
metadata.set("isValid", valueOf(isValid));
metadata.set(PST.IS_VALID, isValid);
if (pstFile.getPSTFileType() == PSTFile.PST_TYPE_2013_UNICODE) {
throw new TikaException(
"OST 2013 support not added yet. It will be when https://github.com/rjohnsondev/java-libpst/issues/60 is fixed.");
}
if (isValid) {
parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
parseFolder(xhtml, pstFile.getRootFolder(), "/", embeddedExtractor);
}
} catch (TikaException e) {
throw e;
Expand All @@ -113,12 +106,19 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
xhtml.endDocument();
}

private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder,
private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, String folderPath,
EmbeddedDocumentExtractor embeddedExtractor) throws Exception {
if (pstFolder.getContentCount() > 0) {
PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
while (pstMail != null) {
parseMailAndAttachments(handler, pstMail, embeddedExtractor);
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING);
metadata.set(PST.PST_FOLDER_PATH, folderPath);
try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
tis.setOpenContainer(pstMail);
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
embeddedExtractor.parseEmbedded(tis, handler, metadata, true);
}
pstMail = (PSTMessage) pstFolder.getNextChild();
}
}
Expand All @@ -127,163 +127,11 @@ private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder,
for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
handler.startElement("div", createAttribute("class", "email-folder"));
handler.element("h1", pstSubFolder.getDisplayName());
parseFolder(handler, pstSubFolder, embeddedExtractor);
// Extend the parent path with the sub-folder's own name. folderPath already ends
// with the parent folder, so appending the parent's name again (as the second
// branch used to) produced wrong paths for nested folders.
String subFolderPath = folderPath.endsWith("/") ? folderPath + pstSubFolder.getDisplayName() :
        folderPath + "/" + pstSubFolder.getDisplayName();
parseFolder(handler, pstSubFolder, subFolderPath, embeddedExtractor);
handler.endElement("div");
}
}
}

/**
 * Emits one {@code <div class="embedded">} for a message, containing its
 * subject as a heading, then its attachments, then the message itself.
 *
 * @param handler           XHTML sink for the wrapper markup
 * @param pstMail           the message to process
 * @param embeddedExtractor extractor used for the message and its attachments
 * @throws SAXException  if writing to {@code handler} fails
 * @throws IOException   if processing the message fails with an I/O error
 * @throws TikaException if attachment processing fails
 */
private void parseMailAndAttachments(XHTMLContentHandler handler, PSTMessage pstMail,
        EmbeddedDocumentExtractor embeddedExtractor)
        throws SAXException, IOException, TikaException {
    final AttributesImpl divAttributes = new AttributesImpl();
    divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
    divAttributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());

    handler.startElement("div", divAttributes);
    handler.element("h1", pstMail.getSubject());

    // Attachments are handled before the mail item on purpose: stream
    // exceptions raised by attachments are recorded in this metadata object,
    // and RecursiveParserWrapper copies the metadata when the mail item is
    // parsed, so anything recorded afterwards would not reach the cached copy.
    final Metadata metadataForMail = new Metadata();
    parseMailAttachments(handler, pstMail, metadataForMail, embeddedExtractor);
    parserMailItem(handler, pstMail, metadataForMail, embeddedExtractor);

    handler.endElement("div");
}

/**
 * Copies the message's fields into {@code mailMetadata} and then hands the
 * plain-text body to the embedded document extractor.
 *
 * @param handler           XHTML sink passed on to the extractor
 * @param pstMail           the message whose fields and body are read
 * @param mailMetadata      metadata object to populate
 * @param embeddedExtractor extractor that receives the message body
 * @throws SAXException if the extractor fails while writing SAX events
 * @throws IOException  if the extractor fails with an I/O error
 */
private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail,
        Metadata mailMetadata, EmbeddedDocumentExtractor embeddedExtractor)
        throws SAXException, IOException {
    final String messageId = pstMail.getInternetMessageId();
    final String senderName = pstMail.getSenderName();

    // Identity and core descriptive fields.
    mailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, messageId);
    mailMetadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, messageId);
    mailMetadata.set(TikaCoreProperties.IDENTIFIER, messageId);
    mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
    mailMetadata.set(Metadata.MESSAGE_FROM, senderName);
    mailMetadata.set(TikaCoreProperties.CREATOR, senderName);
    mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
    mailMetadata.set(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME, pstMail.getClientSubmitTime());
    mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
    mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());

    // Free-form string keys.
    mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
    mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
    mailMetadata.set("recipients", pstMail.getRecipientsString());
    mailMetadata.set("displayTo", pstMail.getDisplayTo());
    mailMetadata.set("displayCC", pstMail.getDisplayCC());
    mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
    mailMetadata.set("importance", valueOf(pstMail.getImportance()));
    mailMetadata.set("priority", valueOf(pstMail.getPriority()));
    mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));

    // Message class and sender / represented-sender details.
    mailMetadata.set(Office.MAPI_MESSAGE_CLASS,
            OutlookExtractor.getMessageClass(pstMail.getMessageClass()));
    mailMetadata.set(Message.MESSAGE_FROM_EMAIL, pstMail.getSenderEmailAddress());
    mailMetadata.set(Office.MAPI_FROM_REPRESENTING_EMAIL,
            pstMail.getSentRepresentingEmailAddress());
    mailMetadata.set(Message.MESSAGE_FROM_NAME, pstMail.getSenderName());
    mailMetadata.set(Office.MAPI_FROM_REPRESENTING_NAME, pstMail.getSentRepresentingName());

    // Recipient details are best effort: a PSTException ends the loop and
    // whatever was collected so far is kept.
    try {
        for (int idx = 0; idx < pstMail.getNumberOfRecipients(); idx++) {
            addRecipientMetadata(pstMail.getRecipient(idx), mailMetadata);
        }
    } catch (PSTException e) {
        //swallow
    }

    //we may want to experiment with working with the bodyHTML.
    //However, because we can't get the raw bytes, we _could_ wind up sending
    //a UTF-8 byte representation of the html that has a conflicting metaheader
    //that causes the HTMLParser to get the encoding wrong. Better if we could get
    //the underlying bytes from the pstMail object...
    final byte[] bodyBytes = pstMail.getBody().getBytes(UTF_8);
    mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE,
            MediaType.TEXT_PLAIN.toString());
    embeddedExtractor.parseEmbedded(
            new ByteArrayInputStream(bodyBytes), handler, mailMetadata, true);
}

/**
 * Adds one recipient's display name and e-mail address to the TO, CC or BCC
 * metadata keys according to its recipient type; other types are ignored.
 */
private void addRecipientMetadata(PSTRecipient recipient, Metadata mailMetadata) {
    switch (OutlookExtractor.RECIPIENT_TYPE.getTypeFromVal(recipient.getRecipientType())) {
        case TO:
            OutlookExtractor.addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME,
                    recipient.getDisplayName(), mailMetadata);
            OutlookExtractor.addEvenIfNull(Message.MESSAGE_TO_EMAIL,
                    recipient.getEmailAddress(), mailMetadata);
            break;
        case CC:
            OutlookExtractor.addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME,
                    recipient.getDisplayName(), mailMetadata);
            OutlookExtractor.addEvenIfNull(Message.MESSAGE_CC_EMAIL,
                    recipient.getEmailAddress(), mailMetadata);
            break;
        case BCC:
            OutlookExtractor.addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME,
                    recipient.getDisplayName(), mailMetadata);
            OutlookExtractor.addEvenIfNull(Message.MESSAGE_BCC_EMAIL,
                    recipient.getEmailAddress(), mailMetadata);
            break;
        default:
            //do we want to handle unspecified or unknown?
            break;
    }
}

/**
 * Processes every attachment of {@code email}.
 * <p>
 * An attachment that is itself a PST message is handled recursively through
 * {@code parseMailAndAttachments}. Any other attachment gets a {@code <p>}
 * with its filename followed by a {@code <div class="embedded">} in which the
 * attachment is passed to the embedded document extractor.
 * <p>
 * Failures are recorded in {@code mailMetadata} and do not stop processing of
 * the remaining attachments. Once the {@code <div>} has been opened it is
 * always closed, including when the attachment stream cannot be obtained or
 * when the extractor throws, so the XHTML output stays balanced.
 *
 * @param xhtml             XHTML sink for the attachment markup
 * @param email             the message whose attachments are processed
 * @param mailMetadata      metadata of the owning message; receives recorded
 *                          attachment exceptions
 * @param embeddedExtractor extractor used for attachment content
 * @throws TikaException declared for callers; per-attachment failures are
 *                       recorded rather than thrown
 */
private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email,
        final Metadata mailMetadata,
        EmbeddedDocumentExtractor embeddedExtractor)
        throws TikaException {
    int numberOfAttachments = email.getNumberOfAttachments();
    for (int i = 0; i < numberOfAttachments; i++) {
        try {
            PSTAttachment attach = email.getAttachment(i);

            PSTMessage attachedEmail = attach.getEmbeddedPSTMessage();
            if (attachedEmail != null) {
                parseMailAndAttachments(xhtml, attachedEmail, embeddedExtractor);
                continue;
            }

            // Get the filename; both long and short filenames can be used for attachments
            String filename = attach.getLongFilename();
            if (filename.isEmpty()) {
                filename = attach.getFilename();
            }

            xhtml.element("p", filename);

            Metadata attachMeta = new Metadata();
            attachMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
            attachMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filename);
            AttributesImpl attributes = new AttributesImpl();
            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
            attributes.addAttribute("", "id", "id", "CDATA", filename);
            xhtml.startElement("div", attributes);
            try {
                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
                    TikaInputStream tis;
                    try {
                        tis = TikaInputStream.get(attach.getFileInputStream());
                    } catch (NullPointerException e) { //TIKA-2488
                        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, mailMetadata);
                        // The enclosing finally still closes the div before moving on.
                        continue;
                    }

                    try {
                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, false);
                    } finally {
                        tis.close();
                    }
                }
            } finally {
                // Close the div on every path (normal, continue, exception).
                xhtml.endElement("div");
            }

        } catch (Exception e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, mailMetadata);
        }
    }
}

}

0 comments on commit de282d2

Please sign in to comment.