/**************************************************************** | |
* Licensed to the Apache Software Foundation (ASF) under one * | |
* or more contributor license agreements. See the NOTICE file * | |
* distributed with this work for additional information * | |
* regarding copyright ownership. The ASF licenses this file * | |
* to you under the Apache License, Version 2.0 (the * | |
* "License"); you may not use this file except in compliance * | |
* with the License. You may obtain a copy of the License at * | |
* * | |
* http://www.apache.org/licenses/LICENSE-2.0 * | |
* * | |
* Unless required by applicable law or agreed to in writing, * | |
* software distributed under the License is distributed on an * | |
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * | |
* KIND, either express or implied. See the License for the * | |
* specific language governing permissions and limitations * | |
* under the License. * | |
****************************************************************/ | |
package org.apache.james.mime4j.parser; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.InputStreamReader; | |
import java.io.Reader; | |
import java.nio.charset.Charset; | |
import java.nio.charset.IllegalCharsetNameException; | |
import java.nio.charset.UnsupportedCharsetException; | |
import java.util.LinkedList; | |
import org.apache.james.mime4j.BodyDescriptor; | |
import org.apache.james.mime4j.MimeException; | |
import org.apache.james.mime4j.decoder.Base64InputStream; | |
import org.apache.james.mime4j.decoder.QuotedPrintableInputStream; | |
import org.apache.james.mime4j.stream.BasicBufferingInputStream; | |
import org.apache.james.mime4j.stream.RootInputStream; | |
import org.apache.james.mime4j.util.InputBuffer; | |
import org.apache.james.mime4j.util.MimeUtil; | |
/**
 * <p>
 * Parses MIME (or RFC822) message streams of bytes or characters.
 * The stream is converted into an event stream.
 * </p>
 * <p>
 * Typical usage:
 * </p>
 * <pre>
 *      MimeTokenStream stream = new MimeTokenStream();
 *      stream.parse(new FileInputStream("mime.msg"));
 *      for (int state = stream.getState();
 *           state != MimeTokenStream.T_END_OF_STREAM;
 *           state = stream.next()) {
 *          switch (state) {
 *            case MimeTokenStream.T_BODY:
 *              System.out.println("Body detected, contents = "
 *                + stream.getInputStream() + ", header data = "
 *                + stream.getBodyDescriptor());
 *              break;
 *            case MimeTokenStream.T_FIELD:
 *              System.out.println("Header field detected: "
 *                + stream.getField());
 *              break;
 *            case MimeTokenStream.T_START_MULTIPART:
 *              System.out.println("Multipart message detected,"
 *                + " header data = "
 *                + stream.getBodyDescriptor());
 *            ...
 *          }
 *      }
 * </pre>
 * <p>Instances of {@link MimeTokenStream} are reusable: invoking the
 * method {@link #parse(InputStream)} resets the token stream's internal
 * state. However, they are definitely <em>not</em> thread safe. If you
 * have a multithreaded application, then the suggested use is to have
 * one instance per thread.</p>
 *
 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
 */
public class MimeTokenStream implements EntityStates, RecursionMode { | |
/** | |
* Creates a stream that creates a more detailed body descriptor. | |
* @return <code>MimeTokenStream</code>, not null | |
*/ | |
public static final MimeTokenStream createMaximalDescriptorStream() { | |
return new MimeTokenStream(false, true); | |
} | |
/** | |
* Creates a stream that strictly validates the input. | |
* @return <code>MimeTokenStream</code> which throws a | |
* <code>MimeException</code> whenever possible issues | |
* are dedicated in the input | |
*/ | |
public static final MimeTokenStream createStrictValidationStream() { | |
return new MimeTokenStream(true, false); | |
} | |
private final boolean strictParsing; | |
private final boolean maximalBodyDescriptor; | |
private final LinkedList entities = new LinkedList(); | |
private int state = T_END_OF_STREAM; | |
private EntityStateMachine currentStateMachine; | |
private int recursionMode = M_RECURSE; | |
private InputBuffer inbuffer; | |
private RootInputStream rootInputStream; | |
/**
 * Constructs a standard (lax) stream: neither strict parsing nor the
 * maximal body descriptor is enabled.
 * Optional validation events will be logged only.
 * Use {@link #createStrictValidationStream()} to create
 * a stream that strictly validates the input.
 */
public MimeTokenStream() {
    this(false, false);
}

/**
 * Constructs a stream with explicit settings. The values are stored and
 * later passed on to the entity created when parsing starts.
 *
 * @param strictParsing whether strict parsing is requested
 * @param maximalBodyDescriptor whether the maximal body descriptor is requested
 */
protected MimeTokenStream(final boolean strictParsing, final boolean maximalBodyDescriptor) {
    this.maximalBodyDescriptor = maximalBodyDescriptor;
    this.strictParsing = strictParsing;
}
/** Instructs the {@code MimeTokenStream} to parse the given streams contents. | |
* If the {@code MimeTokenStream} has already been in use, resets the streams | |
* internal state. | |
*/ | |
public void parse(InputStream stream) { | |
doParse(stream, null); | |
} | |
/** Instructs the {@code MimeTokenStream} to parse the given content with | |
* the content type. The message stream is assumed to have no message header | |
* and is expected to begin with a message body. This can be the case when | |
* the message content is transmitted using a different transport protocol | |
* such as HTTP. | |
* <p/> | |
* If the {@code MimeTokenStream} has already been in use, resets the streams | |
* internal state. | |
*/ | |
public void parseHeadless(InputStream stream, String contentType) { | |
if (contentType == null) { | |
throw new IllegalArgumentException("Content type may not be null"); | |
} | |
doParse(stream, contentType); | |
} | |
private void doParse(InputStream stream, String contentType) { | |
entities.clear(); | |
rootInputStream = new RootInputStream(stream); | |
inbuffer = new InputBuffer(rootInputStream, 4 * 1024); | |
switch (recursionMode) { | |
case M_RAW: | |
RawEntity rawentity = new RawEntity(new BasicBufferingInputStream(inbuffer)); | |
currentStateMachine = rawentity; | |
break; | |
case M_NO_RECURSE: | |
case M_FLAT: | |
// expected to be called only at start of paring | |
case M_RECURSE: | |
MimeEntity mimeentity = new MimeEntity( | |
rootInputStream, | |
new BasicBufferingInputStream(inbuffer), | |
inbuffer, | |
null, | |
T_START_MESSAGE, | |
T_END_MESSAGE, | |
maximalBodyDescriptor, | |
strictParsing); | |
mimeentity.setRecursionMode(recursionMode); | |
if (contentType != null) { | |
mimeentity.skipHeader(contentType); | |
} | |
currentStateMachine = mimeentity; | |
break; | |
} | |
entities.add(currentStateMachine); | |
state = currentStateMachine.getState(); | |
} | |
/** | |
* Determines if this parser is currently in raw mode. | |
* | |
* @return <code>true</code> if in raw mode, <code>false</code> | |
* otherwise. | |
* @see #setRaw(boolean) | |
*/ | |
public boolean isRaw() { | |
return recursionMode == M_RAW; | |
} | |
/** | |
* Gets the current recursion mode. | |
* The recursion mode specifies the approach taken to parsing parts. | |
* {@link #M_RAW} mode does not parse the part at all. | |
* {@link #M_RECURSE} mode recursively parses each mail | |
* when an <code>message/rfc822</code> part is encounted; | |
* {@link #M_NO_RECURSE} does not. | |
* @return {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE} | |
*/ | |
public int getRecursionMode() { | |
return recursionMode; | |
} | |
/** | |
* Sets the current recursion. | |
* The recursion mode specifies the approach taken to parsing parts. | |
* {@link #M_RAW} mode does not parse the part at all. | |
* {@link #M_RECURSE} mode recursively parses each mail | |
* when an <code>message/rfc822</code> part is encounted; | |
* {@link #M_NO_RECURSE} does not. | |
* @param mode {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE} | |
*/ | |
public void setRecursionMode(int mode) { | |
recursionMode = mode; | |
if (currentStateMachine != null) { | |
currentStateMachine.setRecursionMode(mode); | |
} | |
} | |
/** | |
* Finishes the parsing and stops reading lines. | |
* NOTE: No more lines will be parsed but the parser | |
* will still call | |
* {@link ContentHandler#endMultipart()}, | |
* {@link ContentHandler#endBodyPart()}, | |
* {@link ContentHandler#endMessage()}, etc to match previous calls | |
* to | |
* {@link ContentHandler#startMultipart(BodyDescriptor)}, | |
* {@link ContentHandler#startBodyPart()}, | |
* {@link ContentHandler#startMessage()}, etc. | |
*/ | |
public void stop() { | |
inbuffer.clear(); | |
rootInputStream.truncate(); | |
} | |
/** | |
* Returns the current state. | |
*/ | |
public int getState() { | |
return state; | |
} | |
/** | |
* This method returns the raw entity, preamble, or epilogue contents. | |
* <p/> | |
* This method is valid, if {@link #getState()} returns either of | |
* {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}. | |
* | |
* @return Data stream, depending on the current state. | |
* @throws IllegalStateException {@link #getState()} returns an | |
* invalid value. | |
*/ | |
public InputStream getInputStream() { | |
return currentStateMachine.getContentStream(); | |
} | |
/** | |
* This method returns a transfer decoded stream based on the MIME | |
* fields with the standard defaults. | |
* <p/> | |
* This method is valid, if {@link #getState()} returns either of | |
* {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}. | |
* | |
* @return Data stream, depending on the current state. | |
* @throws IllegalStateException {@link #getState()} returns an | |
* invalid value. | |
*/ | |
public InputStream getDecodedInputStream() { | |
BodyDescriptor bodyDescriptor = getBodyDescriptor(); | |
String transferEncoding = bodyDescriptor.getTransferEncoding(); | |
InputStream dataStream = currentStateMachine.getContentStream(); | |
if (MimeUtil.isBase64Encoding(transferEncoding)) { | |
dataStream = new Base64InputStream(dataStream); | |
} else if (MimeUtil.isQuotedPrintableEncoded(transferEncoding)) { | |
dataStream = new QuotedPrintableInputStream(dataStream); | |
} | |
return dataStream; | |
} | |
/** | |
* Gets a reader configured for the current body or body part. | |
* The reader will return a transfer and charset decoded | |
* stream of characters based on the MIME fields with the standard | |
* defaults. | |
* This is a conveniance method and relies on {@link #getInputStream()}. | |
* Consult the javadoc for that method for known limitations. | |
* | |
* @return <code>Reader</code>, not null | |
* @see #getInputStream | |
* @throws IllegalStateException {@link #getState()} returns an | |
* invalid value | |
* @throws UnsupportedCharsetException if there is no JVM support | |
* for decoding the charset | |
* @throws IllegalCharsetNameException if the charset name specified | |
* in the mime type is illegal | |
*/ | |
public Reader getReader() { | |
final BodyDescriptor bodyDescriptor = getBodyDescriptor(); | |
final String mimeCharset = bodyDescriptor.getCharset(); | |
final Charset charset; | |
if (mimeCharset == null || "".equals(mimeCharset)) { | |
charset = Charset.forName("US-ASCII"); | |
} else { | |
charset = Charset.forName(mimeCharset); | |
} | |
final InputStream instream = getDecodedInputStream(); | |
return new InputStreamReader(instream, charset); | |
} | |
/** | |
* <p>Gets a descriptor for the current entity. | |
* This method is valid if {@link #getState()} returns:</p> | |
* <ul> | |
* <li>{@link #T_BODY}</li> | |
* <li>{@link #T_START_MULTIPART}</li> | |
* <li>{@link #T_EPILOGUE}</li> | |
* <li>{@link #T_PREAMBLE}</li> | |
* </ul> | |
* @return <code>BodyDescriptor</code>, not nulls | |
*/ | |
public BodyDescriptor getBodyDescriptor() { | |
return currentStateMachine.getBodyDescriptor(); | |
} | |
/** | |
* This method is valid, if {@link #getState()} returns {@link #T_FIELD}. | |
* @return String with the fields raw contents. | |
* @throws IllegalStateException {@link #getState()} returns another | |
* value than {@link #T_FIELD}. | |
*/ | |
public String getField() { | |
return currentStateMachine.getField(); | |
} | |
/** | |
* This method is valid, if {@link #getState()} returns {@link #T_FIELD}. | |
* @return String with the fields name. | |
* @throws IllegalStateException {@link #getState()} returns another | |
* value than {@link #T_FIELD}. | |
*/ | |
public String getFieldName() { | |
return currentStateMachine.getFieldName(); | |
} | |
/** | |
* This method is valid, if {@link #getState()} returns {@link #T_FIELD}. | |
* @return String with the fields value. | |
* @throws IllegalStateException {@link #getState()} returns another | |
* value than {@link #T_FIELD}. | |
*/ | |
public String getFieldValue() { | |
return currentStateMachine.getFieldValue(); | |
} | |
/** | |
* This method advances the token stream to the next token. | |
* @throws IllegalStateException The method has been called, although | |
* {@link #getState()} was already {@link #T_END_OF_STREAM}. | |
*/ | |
public int next() throws IOException, MimeException { | |
if (state == T_END_OF_STREAM || currentStateMachine == null) { | |
throw new IllegalStateException("No more tokens are available."); | |
} | |
while (currentStateMachine != null) { | |
EntityStateMachine next = currentStateMachine.advance(); | |
if (next != null) { | |
entities.add(next); | |
currentStateMachine = next; | |
} | |
state = currentStateMachine.getState(); | |
if (state != T_END_OF_STREAM) { | |
return state; | |
} | |
entities.removeLast(); | |
if (entities.isEmpty()) { | |
currentStateMachine = null; | |
} else { | |
currentStateMachine = (EntityStateMachine) entities.getLast(); | |
currentStateMachine.setRecursionMode(recursionMode); | |
} | |
} | |
state = T_END_OF_STREAM; | |
return state; | |
} | |
/** | |
* Renders a state as a string suitable for logging. | |
* @param state | |
* @return rendered as string, not null | |
*/ | |
public static final String stateToString(int state) { | |
return AbstractEntity.stateToString(state); | |
} | |
} |