/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.tika.parser.mp3; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.PushbackInputStream; |
| import java.io.UnsupportedEncodingException; |
| import java.util.Iterator; |
| |
| import org.apache.tika.parser.mp3.ID3Tags.ID3Comment; |
| |
| import static java.nio.charset.StandardCharsets.ISO_8859_1; |
| |
| /** |
| * A frame of ID3v2 data, which is then passed to a handler to |
| * be turned into useful data. |
| */ |
| public class ID3v2Frame implements MP3Frame { |
| |
| private static int MAX_RECORD_SIZE = 50_000_000; |
| |
| private int majorVersion; |
| private int minorVersion; |
| private int flags; |
| private int length; |
| /** Excludes the header size part */ |
| private byte[] extendedHeader; |
| private byte[] data; |
| |
| public static void setMaxRecordSize(int maxRecordSize) { |
| MAX_RECORD_SIZE = maxRecordSize; |
| } |
| |
| public int getMajorVersion() { |
| return majorVersion; |
| } |
| |
| public int getMinorVersion() { |
| return minorVersion; |
| } |
| |
| public int getFlags() { |
| return flags; |
| } |
| |
| public int getLength() { |
| return length; |
| } |
| |
| public byte[] getExtendedHeader() { |
| return extendedHeader; |
| } |
| |
| public byte[] getData() { |
| return data; |
| } |
| |
| /** |
| * Returns the next ID3v2 Frame in |
| * the file, or null if the next batch of data |
| * doesn't correspond to either an ID3v2 header. |
| * If no ID3v2 frame could be detected and the passed in input stream is a |
| * {@code PushbackInputStream}, the bytes read so far are pushed back so |
| * that they can be read again. |
| * ID3v2 Frames should come before all Audio ones. |
| */ |
| public static MP3Frame createFrameIfPresent(InputStream inp) |
| throws IOException { |
| int h1 = inp.read(); |
| int h2 = inp.read(); |
| int h3 = inp.read(); |
| |
| // Is it an ID3v2 Frame? |
| if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') { |
| int majorVersion = inp.read(); |
| int minorVersion = inp.read(); |
| if (majorVersion == -1 || minorVersion == -1) { |
| pushBack(inp, h1, h2, h3, majorVersion, minorVersion); |
| return null; |
| } |
| return new ID3v2Frame(majorVersion, minorVersion, inp); |
| } |
| |
| // Not a frame header |
| pushBack(inp, h1, h2, h3); |
| return null; |
| } |
| |
| /** |
| * Pushes bytes back into the stream if possible. This method is called if |
| * no ID3v2 header could be found at the current stream position. |
| * |
| * @param inp the input stream |
| * @param bytes the bytes to be pushed back |
| * @throws IOException if an error occurs |
| */ |
| private static void pushBack(InputStream inp, int... bytes) |
| throws IOException |
| { |
| if (inp instanceof PushbackInputStream) |
| { |
| byte[] buf = new byte[bytes.length]; |
| for (int i = 0; i < bytes.length; i++) |
| { |
| buf[i] = (byte) bytes[i]; |
| } |
| ((PushbackInputStream) inp).unread(buf); |
| } |
| } |
| |
| private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp) |
| throws IOException { |
| this.majorVersion = majorVersion; |
| this.minorVersion = minorVersion; |
| |
| // Get the flags and the length |
| flags = inp.read(); |
| length = get7BitsInt(readFully(inp, 4), 0); |
| |
| // Do we have an extended header? |
| if ((flags & 0x02) == 0x02) { |
| int size = getInt(readFully(inp, 4)); |
| extendedHeader = readFully(inp, size); |
| } |
| |
| // Get the frame's data, or at least as much |
| // of it as we could do |
| data = readFully(inp, length, false); |
| } |
| |
| protected static int getInt(byte[] data) { |
| return getInt(data, 0); |
| } |
| |
| protected static int getInt(byte[] data, int offset) { |
| int b0 = data[offset+0] & 0xFF; |
| int b1 = data[offset+1] & 0xFF; |
| int b2 = data[offset+2] & 0xFF; |
| int b3 = data[offset+3] & 0xFF; |
| return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0); |
| } |
| |
| protected static int getInt3(byte[] data, int offset) { |
| int b0 = data[offset+0] & 0xFF; |
| int b1 = data[offset+1] & 0xFF; |
| int b2 = data[offset+2] & 0xFF; |
| return (b0 << 16) + (b1 << 8) + (b2 << 0); |
| } |
| |
| protected static int getInt2(byte[] data, int offset) { |
| int b0 = data[offset+0] & 0xFF; |
| int b1 = data[offset+1] & 0xFF; |
| return (b0 << 8) + (b1 << 0); |
| } |
| |
| /** |
| * AKA a Synchsafe integer. |
| * 4 bytes hold a 28 bit number. The highest |
| * bit in each byte is always 0 and always ignored. |
| */ |
| protected static int get7BitsInt(byte[] data, int offset) { |
| int b0 = data[offset+0] & 0x7F; |
| int b1 = data[offset+1] & 0x7F; |
| int b2 = data[offset+2] & 0x7F; |
| int b3 = data[offset+3] & 0x7F; |
| return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0); |
| } |
| |
| protected static byte[] readFully(InputStream inp, int length) |
| throws IOException { |
| return readFully(inp, length, true); |
| } |
| |
| protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal) |
| throws IOException { |
| if (MAX_RECORD_SIZE > 0 && length > MAX_RECORD_SIZE) { |
| throw new IOException("Record size ("+length+ |
| " bytes) is larger than the allowed record size: "+MAX_RECORD_SIZE); |
| } |
| byte[] b = new byte[length]; |
| |
| int pos = 0; |
| int read; |
| while (pos < length) { |
| read = inp.read(b, pos, length-pos); |
| if (read == -1) { |
| if(shortDataIsFatal) { |
| throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present"); |
| } else { |
| // Give them what we found |
| // TODO Log the short read |
| return b; |
| } |
| } |
| pos += read; |
| } |
| |
| return b; |
| } |
| |
| protected static class TextEncoding { |
| public final boolean doubleByte; |
| public final String encoding; |
| private TextEncoding(String encoding, boolean doubleByte) { |
| this.doubleByte = doubleByte; |
| this.encoding = encoding; |
| } |
| } |
| protected static final TextEncoding[] encodings = new TextEncoding[] { |
| new TextEncoding("ISO-8859-1", false), |
| new TextEncoding("UTF-16", true), // With BOM |
| new TextEncoding("UTF-16BE", true), // Without BOM |
| new TextEncoding("UTF-8", false) |
| }; |
| |
| /** |
| * Returns the (possibly null padded) String at the given offset and |
| * length. String encoding is held in the first byte; |
| */ |
| protected static String getTagString(byte[] data, int offset, int length) { |
| int actualLength = length; |
| if (actualLength == 0) { |
| return ""; |
| } |
| if (actualLength == 1 && data[offset] == 0) { |
| return ""; |
| } |
| |
| // Does it have an encoding flag? |
| // Detect by the first byte being sub 0x20 |
| TextEncoding encoding = encodings[0]; |
| byte maybeEncodingFlag = data[offset]; |
| if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) { |
| offset++; |
| actualLength--; |
| encoding = encodings[maybeEncodingFlag]; |
| } |
| |
| // Trim off null termination / padding (as present) |
| while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) { |
| actualLength -= 2; |
| } |
| while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) { |
| actualLength--; |
| } |
| if (actualLength == 0) { |
| return ""; |
| } |
| |
| // TIKA-1024: If it's UTF-16 (with BOM) and all we |
| // have is a naked BOM then short-circuit here |
| // (return empty string), because new String(..) |
| // gives different results on different JVMs |
| if (encoding.encoding.equals("UTF-16") && actualLength == 2 && |
| ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) || |
| (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) { |
| return ""; |
| } |
| |
| try { |
| // Build the base string |
| return new String(data, offset, actualLength, encoding.encoding); |
| } catch (UnsupportedEncodingException e) { |
| throw new RuntimeException( |
| "Core encoding " + encoding.encoding + " is not available", e); |
| } |
| } |
| /** |
| * Builds up the ID3 comment, by parsing and extracting |
| * the comment string parts from the given data. |
| */ |
| protected static ID3Comment getComment(byte[] data, int offset, int length) { |
| // Comments must have an encoding |
| int encodingFlag = data[offset]; |
| if (encodingFlag >= 0 && encodingFlag < encodings.length) { |
| // Good, valid flag |
| } else { |
| // Invalid string |
| return null; |
| } |
| |
| TextEncoding encoding = encodings[encodingFlag]; |
| |
| // First is a 3 byte language |
| String lang = getString(data, offset+1, 3); |
| |
| // After that we have [Desc]\0(\0)[Text] |
| int descStart = offset+4; |
| int textStart = -1; |
| String description = null; |
| String text = null; |
| |
| // Find where the description ends |
| try { |
| for (int i=descStart; i<offset+length; i++) { |
| if (encoding.doubleByte && data[i]==0 && data[i+1] == 0) { |
| // Handle LE vs BE on low byte text |
| if (i+2 < offset+length && data[i+1] == 0 && data[i+2] == 0) { |
| i++; |
| } |
| textStart = i+2; |
| description = new String(data, descStart, i-descStart, encoding.encoding); |
| break; |
| } |
| if (!encoding.doubleByte && data[i]==0) { |
| textStart = i+1; |
| description = new String(data, descStart, i-descStart, encoding.encoding); |
| break; |
| } |
| } |
| |
| // Did we find the end? |
| if (textStart > -1) { |
| text = new String(data, textStart, offset+length-textStart, encoding.encoding); |
| } else { |
| // Assume everything is the text |
| text = new String(data, descStart, offset+length-descStart, encoding.encoding); |
| } |
| |
| // Return |
| return new ID3Comment(lang, description, text); |
| } catch (UnsupportedEncodingException e) { |
| throw new RuntimeException( |
| "Core encoding " + encoding.encoding + " is not available", e); |
| } |
| } |
| |
| /** |
| * Returns the String at the given |
| * offset and length. Strings are ISO-8859-1 |
| */ |
| protected static String getString(byte[] data, int offset, int length) { |
| return new String(data, offset, length, ISO_8859_1); |
| } |
| |
| |
| /** |
| * Iterates over id3v2 raw tags. |
| * Create an instance of this that configures the |
| * various length and multipliers. |
| */ |
| protected class RawTagIterator implements Iterator<RawTag> { |
| private int nameLength; |
| private int sizeLength; |
| private int sizeMultiplier; |
| private int flagLength; |
| |
| private int offset = 0; |
| |
| protected RawTagIterator( |
| int nameLength, int sizeLength, int sizeMultiplier, |
| int flagLength) { |
| this.nameLength = nameLength; |
| this.sizeLength = sizeLength; |
| this.sizeMultiplier = sizeMultiplier; |
| this.flagLength = flagLength; |
| } |
| |
| public boolean hasNext() { |
| // Check for padding at the end |
| return offset < data.length && data[offset] != 0; |
| } |
| |
| public RawTag next() { |
| RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier, |
| flagLength, data, offset); |
| offset += tag.getSize(); |
| return tag; |
| } |
| |
| public void remove() { |
| } |
| |
| } |
| |
| protected static class RawTag { |
| private int headerSize; |
| protected String name; |
| protected int flag; |
| protected byte[] data; |
| |
| private RawTag( |
| int nameLength, int sizeLength, int sizeMultiplier, |
| int flagLength, byte[] frameData, int offset) { |
| headerSize = nameLength + sizeLength + flagLength; |
| |
| // Name, normally 3 or 4 bytes |
| name = getString(frameData, offset, nameLength); |
| |
| // Size |
| int rawSize; |
| if (sizeLength == 3) { |
| rawSize = getInt3(frameData, offset+nameLength); |
| } else { |
| rawSize = getInt(frameData, offset+nameLength); |
| } |
| int size = rawSize * sizeMultiplier; |
| |
| // Flag |
| if (flagLength > 0) { |
| if (flagLength == 1) { |
| flag = (int)frameData[offset+nameLength+sizeLength]; |
| } else { |
| flag = getInt2(frameData, offset+nameLength+sizeLength); |
| } |
| } |
| |
| // Now data |
| int copyFrom = offset+nameLength+sizeLength+flagLength; |
| size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files. |
| data = new byte[size]; |
| System.arraycopy(frameData, copyFrom, data, 0, size); |
| } |
| |
| protected int getSize() { |
| return headerSize + data.length; |
| } |
| |
| } |
| |
| } |