Skip to content

Commit 32baf23

Browse files
authored
TIKA-4250 -- add optional parser for pst files -- wrapper for libpst/readpst (#1751)
1 parent 2f8dbdf commit 32baf23

File tree

8 files changed

+580
-0
lines changed

8 files changed

+580
-0
lines changed

CHANGES.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ Release 3.0.0-BETA2 - ???
55
* Updated PST parser to use standard Message metadata keys and improved
66
handling of embedded files (TIKA-4248).
77

8+
Other Changes
9+
10+
* Add optional PST parser based on libpst/readpst (TIKA-4250).
11+
812
Release 3.0.0-BETA - 12/01/2023
913

1014
BREAKING CHANGES

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,13 @@
116116
<artifactId>log4j-slf4j2-impl</artifactId>
117117
<scope>test</scope>
118118
</dependency>
119+
<!-- needed for libpst test files -->
120+
<dependency>
121+
<groupId>${project.groupId}</groupId>
122+
<artifactId>tika-parser-mail-module</artifactId>
123+
<version>${project.version}</version>
124+
<scope>test</scope>
125+
</dependency>
119126
</dependencies>
120127
<build>
121128
<plugins>
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.parser.microsoft.libpst;
18+
19+
import java.io.IOException;
20+
import java.io.InputStream;
21+
import java.nio.file.FileVisitResult;
22+
import java.nio.file.FileVisitor;
23+
import java.nio.file.Path;
24+
import java.nio.file.attribute.BasicFileAttributes;
25+
26+
import org.apache.commons.io.IOExceptionWithCause;
27+
import org.xml.sax.SAXException;
28+
29+
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
30+
import org.apache.tika.extractor.EmbeddedDocumentUtil;
31+
import org.apache.tika.io.TikaInputStream;
32+
import org.apache.tika.metadata.Metadata;
33+
import org.apache.tika.metadata.PST;
34+
import org.apache.tika.parser.ParseContext;
35+
import org.apache.tika.sax.XHTMLContentHandler;
36+
37+
public class EmailVisitor implements FileVisitor<Path> {
38+
39+
private final Path root;
40+
private final boolean processEmailAsMsg;
41+
private final XHTMLContentHandler xhtml;
42+
private final Metadata parentMetadata;
43+
private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
44+
45+
public EmailVisitor(Path root, boolean processEmailAsMsg, XHTMLContentHandler xhtml, Metadata parentMetadata, ParseContext parseContext) {
46+
this.root = root;
47+
this.processEmailAsMsg = processEmailAsMsg;
48+
this.xhtml = xhtml;
49+
this.parentMetadata = parentMetadata;
50+
this.embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
51+
}
52+
53+
@Override
54+
public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
55+
return FileVisitResult.CONTINUE;
56+
}
57+
58+
@Override
59+
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
60+
if (processEmailAsMsg) {
61+
if (file
62+
.getFileName()
63+
.toString()
64+
.endsWith(".msg")) {
65+
process(file);
66+
}
67+
} else if (file
68+
.getFileName()
69+
.toString()
70+
.endsWith(".eml")) {
71+
process(file);
72+
}
73+
return FileVisitResult.CONTINUE;
74+
}
75+
76+
private void process(Path file) throws IOException {
77+
Metadata emailMetadata = new Metadata();
78+
String pstPath = root
79+
.relativize(file.getParent())
80+
.toString();
81+
emailMetadata.set(PST.PST_FOLDER_PATH, pstPath);
82+
try (InputStream is = TikaInputStream.get(file)) {
83+
try {
84+
embeddedDocumentExtractor.parseEmbedded(is, xhtml, emailMetadata, true);
85+
} catch (SAXException e) {
86+
throw new IOExceptionWithCause(e);
87+
}
88+
}
89+
}
90+
91+
@Override
92+
public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
93+
return FileVisitResult.CONTINUE;
94+
}
95+
96+
@Override
97+
public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
98+
return FileVisitResult.CONTINUE;
99+
}
100+
}
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.parser.microsoft.libpst;
18+
19+
import java.io.IOException;
20+
import java.io.InputStream;
21+
import java.nio.file.Files;
22+
import java.nio.file.Path;
23+
import java.util.ArrayList;
24+
import java.util.List;
25+
import java.util.Map;
26+
import java.util.Set;
27+
28+
import org.apache.commons.io.FileUtils;
29+
import org.apache.commons.io.IOUtils;
30+
import org.slf4j.Logger;
31+
import org.slf4j.LoggerFactory;
32+
import org.xml.sax.ContentHandler;
33+
import org.xml.sax.SAXException;
34+
35+
import org.apache.tika.config.Field;
36+
import org.apache.tika.config.Initializable;
37+
import org.apache.tika.config.InitializableProblemHandler;
38+
import org.apache.tika.config.Param;
39+
import org.apache.tika.exception.TikaConfigException;
40+
import org.apache.tika.exception.TikaException;
41+
import org.apache.tika.io.TemporaryResources;
42+
import org.apache.tika.io.TikaInputStream;
43+
import org.apache.tika.metadata.Metadata;
44+
import org.apache.tika.mime.MediaType;
45+
import org.apache.tika.parser.ParseContext;
46+
import org.apache.tika.parser.Parser;
47+
import org.apache.tika.sax.XHTMLContentHandler;
48+
import org.apache.tika.utils.FileProcessResult;
49+
import org.apache.tika.utils.ProcessUtils;
50+
51+
/**
52+
* This is an optional PST parser that relies on the user installing
53+
* the GPL-3 libpst/readpst commandline tool and configuring
54+
* Tika to call this library via tika-config.xml
55+
*/
56+
public class LibPstParser implements Parser, Initializable {
57+
58+
public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
59+
60+
private static final Set<MediaType> SUPPORTED = Set.of(MS_OUTLOOK_PST_MIMETYPE);
61+
62+
private static final Logger LOGGER = LoggerFactory.getLogger(LibPstParser.class);
63+
64+
private static final int MAX_STDOUT = 100000;
65+
private static final int MAX_STDERR = 10000;
66+
private static final String READ_PST_COMMAND = "readpst";
67+
68+
private LibPstParserConfig defaultConfig = new LibPstParserConfig();
69+
70+
@Override
71+
public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
72+
return SUPPORTED;
73+
}
74+
75+
@Override
76+
public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
77+
TikaInputStream tis = TikaInputStream.cast(inputStream);
78+
TemporaryResources tmp = null;
79+
if (tis == null) {
80+
tmp = new TemporaryResources();
81+
tis = TikaInputStream.get(inputStream, tmp, metadata);
82+
}
83+
try {
84+
_parse(tis.getPath(), contentHandler, metadata, parseContext);
85+
} finally {
86+
IOUtils.closeQuietly(tmp);
87+
}
88+
}
89+
90+
private void _parse(Path pst, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws TikaException, IOException, SAXException {
91+
LibPstParserConfig activeConfig = parseContext.get(LibPstParserConfig.class, defaultConfig);
92+
Path outDir = Files.createTempDirectory("libpst-");
93+
Path debugFile = activeConfig.isDebug() ? Files.createTempFile("tika-libpst-debug", ".txt") : null;
94+
try {
95+
ProcessBuilder pb = getProcessBuilder(pst, activeConfig, outDir, debugFile);
96+
XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
97+
FileProcessResult fileProcessResult = ProcessUtils.execute(pb, activeConfig.getTimeoutSeconds() * 1000l, MAX_STDOUT, MAX_STDERR);
98+
xhtml.startDocument();
99+
processContents(outDir, activeConfig, xhtml, metadata, parseContext);
100+
if (fileProcessResult.isTimeout()) {
101+
throw new TikaException("Timeout exception: " + fileProcessResult.getProcessTimeMillis());
102+
}
103+
if (fileProcessResult.getExitValue() != 0) {
104+
LOGGER.warn("libpst bad exit value {}: {}", fileProcessResult.getExitValue(), fileProcessResult.getStderr());
105+
throw new TikaException("Bad exit value: " + fileProcessResult.getExitValue());
106+
}
107+
xhtml.endDocument();
108+
} finally {
109+
try {
110+
FileUtils.deleteDirectory(outDir.toFile());
111+
} catch (IOException e) {
112+
LOGGER.warn("Couldn't delete temporary directory: " + outDir.toAbsolutePath(), e);
113+
}
114+
try {
115+
if (debugFile != null) {
116+
Files.delete(debugFile);
117+
}
118+
} catch (IOException e) {
119+
LOGGER.warn("Couldn't delete debug file?!", e);
120+
}
121+
}
122+
}
123+
124+
private void processContents(Path outDir, LibPstParserConfig config, XHTMLContentHandler xhtml, Metadata metadata, ParseContext parseContext) throws IOException {
125+
Files.walkFileTree(outDir, new EmailVisitor(outDir, config.isProcessEmailAsMsg(), xhtml, metadata, parseContext));
126+
}
127+
128+
private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir, Path debugFile) {
129+
List commands = new ArrayList<String>();
130+
commands.add(READ_PST_COMMAND);
131+
if (config.isDebug()) {
132+
commands.add("-d");
133+
commands.add(ProcessUtils.escapeCommandLine(debugFile
134+
.toAbsolutePath()
135+
.toString()));
136+
}
137+
if (config.isIncludeDeleted()) {
138+
commands.add("-D");
139+
}
140+
if (config.isProcessEmailAsMsg()) {
141+
commands.add("-m");
142+
} else {
143+
//include .eml and include extensions
144+
commands.add("-e");
145+
}
146+
commands.add("-o");
147+
commands.add(ProcessUtils.escapeCommandLine(outDir
148+
.toAbsolutePath()
149+
.toString()));
150+
151+
commands.add(ProcessUtils.escapeCommandLine(pst
152+
.toAbsolutePath()
153+
.toString()));
154+
LOGGER.debug("command arguments: " + commands);
155+
return new ProcessBuilder(commands);
156+
}
157+
158+
@Override
159+
public void initialize(Map<String, Param> map) throws TikaConfigException {
160+
try {
161+
check();
162+
} catch (IOException e) {
163+
LOGGER.error("Couldn't get version of libpst", e);
164+
throw new TikaConfigException("Unable to check version of readpst. Is it installed?!", e);
165+
}
166+
}
167+
168+
@Override
169+
public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException {
170+
171+
}
172+
173+
//throws exception if readpst is not available
174+
private static void check() throws TikaConfigException, IOException {
175+
ProcessBuilder pb = new ProcessBuilder(READ_PST_COMMAND, "-V");
176+
FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 10000);
177+
if (result.getExitValue() != 0) {
178+
throw new TikaConfigException(
179+
"bad exit value for LibPstParser. It must be installed and on the path" + " if this parser is configured. Exit value: " + result.getExitValue());
180+
}
181+
if (result.isTimeout()) {
182+
throw new TikaConfigException("timeout trying to get version from readpst?!");
183+
}
184+
}
185+
186+
public static boolean checkQuietly() {
187+
try {
188+
check();
189+
} catch (TikaConfigException | IOException e) {
190+
return false;
191+
}
192+
return true;
193+
}
194+
195+
@Field
196+
public void setTimeoutSeconds(long timeoutSeconds) {
197+
defaultConfig.setTimeoutSeconds(timeoutSeconds);
198+
}
199+
200+
@Field
201+
public void setProcessEmailAsMsg(boolean processEmailAsMsg) {
202+
defaultConfig.setProcessEmailAsMsg(processEmailAsMsg);
203+
}
204+
205+
@Field
206+
public void setIncludeDeleted(boolean includeDeleted) {
207+
defaultConfig.setIncludeDeleted(includeDeleted);
208+
}
209+
210+
@Field
211+
public void setMaxEmails(int maxEmails) {
212+
defaultConfig.setMaxEmails(maxEmails);
213+
}
214+
215+
216+
}

0 commit comments

Comments
 (0)