From de282d2861009895eecdb07784dceb5d777f372a Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Tue, 30 Apr 2024 12:15:38 -0400 Subject: [PATCH] TIKA-4248 -- improve handling of attachments in PST (#1738) * TIKA-4248 -- improve PST handling of embedded files. --- CHANGES.txt | 7 + .../java/org/apache/tika/metadata/Office.java | 8 + .../java/org/apache/tika/metadata/PST.java | 26 ++ .../apache/tika/parser/html/JSoupParser.java | 18 ++ .../microsoft/pst/OutlookPSTParser.java | 182 +----------- .../microsoft/pst/PSTMailItemParser.java | 270 ++++++++++++++++++ .../services/org.apache.tika.parser.Parser | 1 + .../microsoft/pst/OutlookPSTParserTest.java | 93 ++---- 8 files changed, 368 insertions(+), 237 deletions(-) create mode 100644 tika-core/src/main/java/org/apache/tika/metadata/PST.java create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java diff --git a/CHANGES.txt b/CHANGES.txt index 187aa2f553..3aa2c7b44a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,10 @@ +Release 3.0.0-BETA2 - ??? + + BREAKING CHANGES + + * Updated PST parser to use standard Message metadata keys and improved + handling of embedded files (TIKA-4248). 
+ Release 3.0.0-BETA - 12/01/2023 BREAKING CHANGES diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 815f060c16..2a9e428eb0 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -176,4 +176,12 @@ public interface Office { Property PROG_ID = Property.internalText("msoffice:progID"); Property OCX_NAME = Property.internalText("msoffice:ocxName"); + Property MAPI_RECIPIENTS_STRING = Property.internalText(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-recipients-string"); + Property MAPI_IMPORTANCE = Property.internalInteger(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance"); + Property MAPI_PRIORTY = Property.internalInteger(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-priority"); + Property MAPI_IS_FLAGGED = Property.internalBoolean(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PST.java b/tika-core/src/main/java/org/apache/tika/metadata/PST.java new file mode 100644 index 0000000000..d977c2e19f --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/PST.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.apache.tika.metadata; + +public interface PST { + + String PST_PREFIX = "pst:"; + Property PST_FOLDER_PATH = Property.internalText(PST_PREFIX + "folderPath"); + Property DESCRIPTOR_NODE_ID = Property.internalText(PST_PREFIX + "descriptorNodeId"); + Property IS_VALID = Property.internalBoolean(PST_PREFIX + "isValid"); +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java index b9bc4db8bf..05b2bcd732 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java @@ -139,7 +139,25 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } finally { xhtml.endDocument(); } + } + + public void parseString(String html, ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException { + // Get the HTML mapper from the parse context + HtmlMapper mapper = context.get(HtmlMapper.class, new DefaultHtmlMapper()); + //do better with baseUri? 
+ Document document = Jsoup.parse(html); + document.quirksMode(Document.QuirksMode.quirks); + ContentHandler xhtml = new XHTMLDowngradeHandler( + new HtmlHandler(mapper, handler, metadata, context, extractScripts)); + xhtml.startDocument(); + try { + NodeTraversor.filter(new TikaNodeFilter(xhtml), document); + } catch (RuntimeSAXException e) { + throw e.getWrapped(); + } finally { + xhtml.endDocument(); + } } private class TikaNodeFilter implements NodeFilter { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java index 9257b0e4e3..ded254489e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java @@ -17,20 +17,15 @@ package org.apache.tika.parser.microsoft.pst; import static java.lang.String.valueOf; -import static java.nio.charset.StandardCharsets.UTF_8; import static java.util.Collections.singleton; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Set; -import com.pff.PSTAttachment; -import com.pff.PSTException; import com.pff.PSTFile; import com.pff.PSTFolder; import com.pff.PSTMessage; -import com.pff.PSTRecipient; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -39,14 +34,12 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; -import 
org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.PST; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.microsoft.OutlookExtractor; import org.apache.tika.sax.XHTMLContentHandler; /** @@ -88,13 +81,13 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, pstFile = new PSTFile(in.getFile().getPath()); metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length())); boolean isValid = pstFile.getFileHandle().getFD().valid(); - metadata.set("isValid", valueOf(isValid)); + metadata.set(PST.IS_VALID, isValid); if (pstFile.getPSTFileType() == PSTFile.PST_TYPE_2013_UNICODE) { throw new TikaException( "OST 2013 support not added yet. It will be when https://github.com/rjohnsondev/java-libpst/issues/60 is fixed."); } if (isValid) { - parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor); + parseFolder(xhtml, pstFile.getRootFolder(), "/", embeddedExtractor); } } catch (TikaException e) { throw e; @@ -113,12 +106,19 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, xhtml.endDocument(); } - private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, + private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, String folderPath, EmbeddedDocumentExtractor embeddedExtractor) throws Exception { if (pstFolder.getContentCount() > 0) { PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild(); while (pstMail != null) { - parseMailAndAttachments(handler, pstMail, embeddedExtractor); + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING); + metadata.set(PST.PST_FOLDER_PATH, folderPath); + try (TikaInputStream tis = 
TikaInputStream.get(new byte[0])) { + tis.setOpenContainer(pstMail); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); + embeddedExtractor.parseEmbedded(tis, handler, metadata, true); + } pstMail = (PSTMessage) pstFolder.getNextChild(); } } @@ -127,163 +127,11 @@ private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) { handler.startElement("div", createAttribute("class", "email-folder")); handler.element("h1", pstSubFolder.getDisplayName()); - parseFolder(handler, pstSubFolder, embeddedExtractor); + String subFolderPath = folderPath.endsWith("/") ? folderPath + pstSubFolder.getDisplayName() : + folderPath + "/" + pstSubFolder.getDisplayName(); + parseFolder(handler, pstSubFolder, subFolderPath, embeddedExtractor); handler.endElement("div"); } } } - - private void parseMailAndAttachments(XHTMLContentHandler handler, PSTMessage pstMail, - EmbeddedDocumentExtractor embeddedExtractor) - throws SAXException, IOException, TikaException { - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId()); - handler.startElement("div", attributes); - handler.element("h1", pstMail.getSubject()); - - final Metadata mailMetadata = new Metadata(); - // parse attachments first so that stream exceptions - // in attachments can make it into mailMetadata. 
- // RecursiveParserWrapper copies the metadata and thereby prevents - // modifications to mailMetadata from making it into the - // metadata objects cached by the RecursiveParserWrapper - parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor); - parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor); - - handler.endElement("div"); - } - - private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, - Metadata mailMetadata, EmbeddedDocumentExtractor embeddedExtractor) - throws SAXException, IOException { - mailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); - mailMetadata - .set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId()); - mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId()); - mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject()); - mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName()); - mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName()); - mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime()); - mailMetadata.set(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME, pstMail.getClientSubmitTime()); - mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime()); - mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment()); - mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId())); - mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress()); - mailMetadata.set("recipients", pstMail.getRecipientsString()); - mailMetadata.set("displayTo", pstMail.getDisplayTo()); - mailMetadata.set("displayCC", pstMail.getDisplayCC()); - mailMetadata.set("displayBCC", pstMail.getDisplayBCC()); - mailMetadata.set("importance", valueOf(pstMail.getImportance())); - mailMetadata.set("priority", valueOf(pstMail.getPriority())); - mailMetadata.set("flagged", valueOf(pstMail.isFlagged())); - mailMetadata.set(Office.MAPI_MESSAGE_CLASS, - 
OutlookExtractor.getMessageClass(pstMail.getMessageClass())); - - mailMetadata.set(Message.MESSAGE_FROM_EMAIL, pstMail.getSenderEmailAddress()); - - mailMetadata.set(Office.MAPI_FROM_REPRESENTING_EMAIL, - pstMail.getSentRepresentingEmailAddress()); - - mailMetadata.set(Message.MESSAGE_FROM_NAME, pstMail.getSenderName()); - mailMetadata.set(Office.MAPI_FROM_REPRESENTING_NAME, pstMail.getSentRepresentingName()); - - //add recipient details - try { - for (int i = 0; i < pstMail.getNumberOfRecipients(); i++) { - PSTRecipient recipient = pstMail.getRecipient(i); - switch (OutlookExtractor.RECIPIENT_TYPE - .getTypeFromVal(recipient.getRecipientType())) { - case TO: - OutlookExtractor.addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME, - recipient.getDisplayName(), mailMetadata); - OutlookExtractor.addEvenIfNull(Message.MESSAGE_TO_EMAIL, - recipient.getEmailAddress(), mailMetadata); - break; - case CC: - OutlookExtractor.addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME, - recipient.getDisplayName(), mailMetadata); - OutlookExtractor.addEvenIfNull(Message.MESSAGE_CC_EMAIL, - recipient.getEmailAddress(), mailMetadata); - break; - case BCC: - OutlookExtractor.addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, - recipient.getDisplayName(), mailMetadata); - OutlookExtractor.addEvenIfNull(Message.MESSAGE_BCC_EMAIL, - recipient.getEmailAddress(), mailMetadata); - break; - default: - //do we want to handle unspecified or unknown? - break; - } - } - } catch (PSTException e) { - //swallow - } - //we may want to experiment with working with the bodyHTML. - //However, because we can't get the raw bytes, we _could_ wind up sending - //a UTF-8 byte representation of the html that has a conflicting metaheader - //that causes the HTMLParser to get the encoding wrong. Better if we could get - //the underlying bytes from the pstMail object... 
- - byte[] mailContent = pstMail.getBody().getBytes(UTF_8); - mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE, - MediaType.TEXT_PLAIN.toString()); - embeddedExtractor - .parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true); - } - - private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, - final Metadata mailMetadata, - EmbeddedDocumentExtractor embeddedExtractor) - throws TikaException { - int numberOfAttachments = email.getNumberOfAttachments(); - for (int i = 0; i < numberOfAttachments; i++) { - try { - PSTAttachment attach = email.getAttachment(i); - - PSTMessage attachedEmail = attach.getEmbeddedPSTMessage(); - if (attachedEmail != null) { - parseMailAndAttachments(xhtml, attachedEmail, embeddedExtractor); - continue; - } - - // Get the filename; both long and short filenames can be used for attachments - String filename = attach.getLongFilename(); - if (filename.isEmpty()) { - filename = attach.getFilename(); - } - - xhtml.element("p", filename); - - Metadata attachMeta = new Metadata(); - attachMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename); - attachMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filename); - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", filename); - xhtml.startElement("div", attributes); - if (embeddedExtractor.shouldParseEmbedded(attachMeta)) { - TikaInputStream tis = null; - try { - tis = TikaInputStream.get(attach.getFileInputStream()); - } catch (NullPointerException e) { //TIKA-2488 - EmbeddedDocumentUtil.recordEmbeddedStreamException(e, mailMetadata); - continue; - } - - try { - embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, false); - } finally { - tis.close(); - } - } - xhtml.endElement("div"); - - } catch (Exception e) { - EmbeddedDocumentUtil.recordEmbeddedStreamException(e, mailMetadata); - } - } - } - } diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java new file mode 100644 index 0000000000..f0fbd9f68b --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.pst; + +import static java.lang.String.valueOf; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Set; + +import com.pff.PSTAttachment; +import com.pff.PSTException; +import com.pff.PSTMessage; +import com.pff.PSTRecipient; +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Message; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.PST; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.html.JSoupParser; +import org.apache.tika.parser.microsoft.OutlookExtractor; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; + +public class PSTMailItemParser implements Parser { + + //this is a synthetic file type to represent a notional "pst item" + public static final MediaType PST_MAIL_ITEM = MediaType.application("x-tika-pst-mail-item"); + public static final String PST_MAIL_ITEM_STRING = PST_MAIL_ITEM.toString(); + public static final Set SUPPORTED_ITEMS = Set.of(PST_MAIL_ITEM); + + @Override + public Set getSupportedTypes(ParseContext context) { + return SUPPORTED_ITEMS; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, 
TikaException { + TikaInputStream tis = TikaInputStream.cast(stream); + if (tis == null) { + throw new TikaException("Stream must be a TikaInputStream"); + } + Object openContainerObj = tis.getOpenContainer(); + if (openContainerObj == null) { + throw new TikaException("Open container must not be null."); + } + if (! (openContainerObj instanceof PSTMessage)) { + throw new TikaException("Open container must be a PSTMessage"); + } + PSTMessage pstMsg = (PSTMessage) openContainerObj; + EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + parseMailAndAttachments(pstMsg, xhtml, metadata, context, ex); + xhtml.endDocument(); + } + + private void parseMailAndAttachments(PSTMessage pstMsg, XHTMLContentHandler handler, Metadata metadata, ParseContext context, + EmbeddedDocumentExtractor embeddedExtractor) + throws SAXException, IOException, TikaException { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", pstMsg.getInternetMessageId()); + handler.startElement("div", attributes); + handler.element("h1", pstMsg.getSubject()); + + parseMailItem(pstMsg, handler, metadata, context); + parseMailAttachments(pstMsg, handler, metadata, context, embeddedExtractor); + handler.endElement("div"); + } + + private void parseMailItem(PSTMessage pstMail, XHTMLContentHandler xhtml, + Metadata metadata, ParseContext context) throws SAXException, IOException, TikaException { + extractMetadata(pstMail, metadata); + //try the html first. It preserves logical paragraph markers + String htmlChunk = pstMail.getBodyHTML(); + if (! 
StringUtils.isBlank(htmlChunk)) { + Parser htmlParser = EmbeddedDocumentUtil + .tryToFindExistingLeafParser(JSoupParser.class, context); + if (htmlParser == null) { + htmlParser = new JSoupParser(); + } + if (htmlParser instanceof JSoupParser) { + ((JSoupParser)htmlParser).parseString(htmlChunk, + new EmbeddedContentHandler(new BodyContentHandler(xhtml)), + metadata, context); + } else { + byte[] data = htmlChunk.getBytes(StandardCharsets.UTF_8); + htmlParser.parse(new UnsynchronizedByteArrayInputStream(data), + new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), context); + } + return; + } + //if there's no html, back off to straight text -- TODO maybe add RTF parsing? + //splitting on "\r\n|\n" doesn't work because the new lines in the + //body are not logical new lines...they are presentation new lines. + String mailContent = pstMail.getBody(); + xhtml.startElement("p"); + xhtml.characters(mailContent); + xhtml.endElement("p"); + } + + private void extractMetadata(PSTMessage pstMail, Metadata metadata) { + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); + metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId()); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name()); + metadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId()); + metadata.set(TikaCoreProperties.TITLE, pstMail.getSubject()); + metadata.set(TikaCoreProperties.SUBJECT, pstMail.getSubject()); + metadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName()); + metadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName()); + metadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime()); + metadata.set(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME, pstMail.getClientSubmitTime()); + metadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime()); + metadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment()); + 
metadata.set(PST.DESCRIPTOR_NODE_ID, valueOf(pstMail.getDescriptorNodeId())); + metadata.set(Message.MESSAGE_FROM_EMAIL, pstMail.getSenderEmailAddress()); + if (! StringUtils.isBlank(pstMail.getRecipientsString()) && + ! pstMail.getRecipientsString().equals("No recipients table!")) { + metadata.set(Office.MAPI_RECIPIENTS_STRING, pstMail.getRecipientsString()); + } + metadata.set(Message.MESSAGE_TO_DISPLAY_NAME, pstMail.getDisplayTo()); + metadata.set(Message.MESSAGE_CC_DISPLAY_NAME, pstMail.getDisplayCC()); + metadata.set(Message.MESSAGE_BCC_DISPLAY_NAME, pstMail.getDisplayBCC()); + metadata.set(Office.MAPI_IMPORTANCE, pstMail.getImportance()); + metadata.set(Office.MAPI_PRIORTY, pstMail.getPriority()); + metadata.set(Office.MAPI_IS_FLAGGED, pstMail.isFlagged()); + metadata.set(Office.MAPI_MESSAGE_CLASS, + OutlookExtractor.getMessageClass(pstMail.getMessageClass())); + + metadata.set(Message.MESSAGE_FROM_EMAIL, pstMail.getSenderEmailAddress()); + + metadata.set(Office.MAPI_FROM_REPRESENTING_EMAIL, + pstMail.getSentRepresentingEmailAddress()); + + metadata.set(Message.MESSAGE_FROM_NAME, pstMail.getSenderName()); + metadata.set(Office.MAPI_FROM_REPRESENTING_NAME, pstMail.getSentRepresentingName()); + + //add recipient details + try { + for (int i = 0; i < pstMail.getNumberOfRecipients(); i++) { + PSTRecipient recipient = pstMail.getRecipient(i); + switch (OutlookExtractor.RECIPIENT_TYPE + .getTypeFromVal(recipient.getRecipientType())) { + case TO: + OutlookExtractor.addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME, + recipient.getDisplayName(), metadata); + OutlookExtractor.addEvenIfNull(Message.MESSAGE_TO_EMAIL, + recipient.getEmailAddress(), metadata); + break; + case CC: + OutlookExtractor.addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME, + recipient.getDisplayName(), metadata); + OutlookExtractor.addEvenIfNull(Message.MESSAGE_CC_EMAIL, + recipient.getEmailAddress(), metadata); + break; + case BCC: + OutlookExtractor.addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, + 
recipient.getDisplayName(), metadata); + OutlookExtractor.addEvenIfNull(Message.MESSAGE_BCC_EMAIL, + recipient.getEmailAddress(), metadata); + break; + default: + //do we want to handle unspecified or unknown? + break; + } + } + } catch (IOException | PSTException e) { + //swallow + } + + } + + private void parseMailAttachments(PSTMessage email, XHTMLContentHandler xhtml, + Metadata metadata, ParseContext context, + EmbeddedDocumentExtractor embeddedExtractor) + throws TikaException { + int numberOfAttachments = email.getNumberOfAttachments(); + for (int i = 0; i < numberOfAttachments; i++) { + try { + PSTAttachment attachment = email.getAttachment(i); + parseMailAttachment(xhtml, attachment, metadata, embeddedExtractor); + } catch (Exception e) { + EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); + } + } + } + + private void parseMailAttachment(XHTMLContentHandler xhtml, PSTAttachment attachment, Metadata metadata, + EmbeddedDocumentExtractor embeddedExtractor) throws PSTException, IOException, + TikaException, SAXException { + + PSTMessage attachedEmail = attachment.getEmbeddedPSTMessage(); + attachment.getAttachMethod(); + //check for whether this is a binary attachment or an embedded pst msg + if (attachedEmail != null) { + try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { + tis.setOpenContainer(attachedEmail); + Metadata attachMetadata = new Metadata(); + attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING); + attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, attachedEmail.getInternetMessageId()); + attachMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name()); + embeddedExtractor.parseEmbedded(tis, xhtml, attachMetadata, true); + } + return; + } + + // Get the filename; both long and short filenames can be used for attachments + String filename = attachment.getLongFilename(); + if (filename.isEmpty()) { + 
filename = attachment.getFilename(); + } + + xhtml.element("p", filename); + + Metadata attachMeta = new Metadata(); + attachMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename); + attachMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filename); + attachMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", filename); + xhtml.startElement("div", attributes); + if (embeddedExtractor.shouldParseEmbedded(attachMeta)) { + TikaInputStream tis = null; + try { + tis = TikaInputStream.get(attachment.getFileInputStream()); + } catch (NullPointerException e) { //TIKA-2488 + EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); + return; + } + + try { + embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, false); + } finally { + tis.close(); + } + } + xhtml.endElement("div"); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 621530848a..2096380b86 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -28,3 +28,4 @@ org.apache.tika.parser.microsoft.xml.WordMLParser org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser org.apache.tika.parser.microsoft.chm.ChmParser org.apache.tika.parser.microsoft.pst.OutlookPSTParser 
+org.apache.tika.parser.microsoft.pst.PSTMailItemParser diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index c95547aeee..c65a527586 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -17,31 +17,22 @@ package org.apache.tika.parser.microsoft.pst; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; import java.util.List; import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; import org.apache.tika.TikaTest; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.PST; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.sax.ToHTMLContentHandler; public class 
OutlookPSTParserTest extends TikaTest { @@ -54,52 +45,21 @@ public void testAccept() throws Exception { } @Test - public void testParse() throws Exception { - Metadata metadata = new Metadata(); - ContentHandler handler = new ToHTMLContentHandler(); - - ParseContext context = new ParseContext(); - EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context); - context.set(EmbeddedDocumentExtractor.class, trackingExtrator); - context.set(Parser.class, new AutoDetectParser()); - - AUTO_DETECT_PARSER - .parse(getResourceAsStream("/test-documents/testPST.pst"), handler, metadata, - context); - - String output = handler.toString(); - - assertFalse(output.isEmpty()); - assertTrue(output.contains("")); - assertTrue(output.contains( - "")); + public void testLegacyXML() throws Exception { + String output = getXML("testPST.pst").xml; + assertTrue(output.contains("

")); + assertTrue(output.contains("
" + "

Re: Feature Generators

")); assertTrue(output.contains( - "
" + - "

Re: Feature Generators

")); - assertTrue(output.contains( - "

Re: init tokenizer fails: \"Bad type in " + + "

Re: init tokenizer fails: \"Bad type in " + "putfield/putstatic\"

")); assertTrue(output.contains("Gary Murphy commented on TIKA-1250:")); - assertTrue( - output.contains("

Racine (pour la recherche)

")); + assertTrue(output.contains("

Racine (pour la recherche)

")); assertTrue(output.contains("This is a docx attachment.")); - - List metaList = trackingExtrator.trackingMetadata; - assertEquals(9, metaList.size()); - - Metadata firstMail = metaList.get(0); - assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR)); - assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE)); - assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress")); - assertEquals("users@opennlp.apache.org", firstMail.get("displayTo")); - assertEquals("", firstMail.get("displayCC")); - assertEquals("", firstMail.get("displayBCC")); - } @Test @@ -107,10 +67,16 @@ public void testExtendedMetadata() throws Exception { List metadataList = getRecursiveMetadata("testPST.pst"); Metadata m1 = metadataList.get(1); assertEquals("Jörn Kottmann", m1.get(Message.MESSAGE_FROM_NAME)); + assertEquals("Jörn Kottmann", m1.get(TikaCoreProperties.CREATOR)); + assertEquals("Re: Feature Generators", m1.get(TikaCoreProperties.TITLE)); + assertEquals("users@opennlp.apache.org", m1.get(Message.MESSAGE_TO_DISPLAY_NAME)); + assertEquals("", m1.get(Message.MESSAGE_CC_DISPLAY_NAME)); + assertEquals("", m1.get(Message.MESSAGE_BCC_DISPLAY_NAME)); assertEquals("kottmann@gmail.com", m1.get(Message.MESSAGE_FROM_EMAIL)); assertEquals("Jörn Kottmann", m1.get(Office.MAPI_FROM_REPRESENTING_NAME)); assertEquals("kottmann@gmail.com", m1.get(Office.MAPI_FROM_REPRESENTING_EMAIL)); assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS)); + assertEquals("/Début du fichier de données Outlook", m1.get(PST.PST_FOLDER_PATH)); Metadata m6 = metadataList.get(6); assertEquals("Couchbase", m6.get(Message.MESSAGE_FROM_NAME)); @@ -118,6 +84,9 @@ public void testExtendedMetadata() throws Exception { assertEquals("Couchbase", m6.get(Office.MAPI_FROM_REPRESENTING_NAME)); assertEquals("couchbase@couchbase.com", m6.get(Office.MAPI_FROM_REPRESENTING_EMAIL)); assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS)); + 
assertNull(m1.get(Office.MAPI_RECIPIENTS_STRING)); + assertContains("2014-02-26", m1.get(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME)); + //test full EX email assertEquals( "/o=ExchangeLabs/ou=Exchange Administrative Group (FYDIBOHF23SPDLT)" + @@ -127,7 +96,11 @@ public void testExtendedMetadata() throws Exception { assertEquals("Couchbase", m6.get(Message.MESSAGE_FROM_NAME)); assertEquals("couchbase@couchbase.com", m6.get(Message.MESSAGE_FROM_EMAIL)); - assertContains("2014-02-26", m1.get(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME)); + + Metadata m7 = metadataList.get(7); + assertEquals("/<2915856a7d3449e68529f3e61b8d26bc@pf.gov.br>/<3148510c2360443396a78d35e0888de9@pf.gov.br>/attachment.docx", + m7.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + assertEquals("/7/8/9", m7.get(TikaCoreProperties.EMBEDDED_ID_PATH)); } @Test @@ -145,24 +118,4 @@ public void testOverrideDetector() throws Exception { //TODO: figure out why the bold markup isn't coming through if we do extract then parse // the bodyhtml } - - private static class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor { - List trackingMetadata = new ArrayList<>(); - - public EmbeddedTrackingExtrator(ParseContext context) { - super(context); - } - - @Override - public boolean shouldParseEmbedded(Metadata metadata) { - return true; - } - - @Override - public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, - boolean outputHtml) throws SAXException, IOException { - this.trackingMetadata.add(metadata); - super.parseEmbedded(stream, handler, metadata, outputHtml); - } - } }