| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.iptc; |
| |
| import static java.nio.charset.StandardCharsets.UTF_8; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.text.ParseException; |
| import java.text.SimpleDateFormat; |
| import java.util.Collections; |
| import java.util.Date; |
| import java.util.HashMap; |
| import java.util.Locale; |
| import java.util.Set; |
| import java.util.TimeZone; |
| |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.parser.Parser; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| /** |
| * Parser for IPTC ANPA News Wire Feeds |
| */ |
| public class IptcAnpaParser implements Parser { |
| /** |
| * Serial version UID |
| */ |
| private static final long serialVersionUID = -6062820170212879115L; |
| |
| private static final MediaType TYPE = MediaType.text("vnd.iptc.anpa"); |
| |
| private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(TYPE); |
| private final static char SOH = 0x01; // start of header (ctrl-a) |
| private final static char STX = 0x02; // start of text (ctrl-b) |
| private final static char ETX = 0x03; // end of text (ctrl-c) |
| private final static char EOT = 0x04; // the tab character (ctrl-d) |
| private final static char SYN = 0x16; // synchronous idle (ctrl-v) |
| private final static char BS = 0x08; // the backspace character (used for diacriticals) |
| private final static char TB = 0x09; // the tab character |
| private final static char LF = 0x0A; // line feed |
| private final static char FF = 0x0C; // form feed |
| private final static char CR = 0x0D; // carriage return |
| private final static char XQ = 0x11; // device control (ctrl-q) |
| private final static char XS = 0x13; // device control (ctrl-s) |
| private final static char FS = 0x1F; // a field delimiter |
| private final static char HY = 0x2D; // hyphen |
| private final static char SP = 0x20; // the blank space |
| private final static char LT = 0x3C; // less than |
| private final static char EQ = 0x3D; // less than |
| private final static char CT = 0x5E; // carat |
| private final static char SL = 0x91; // single-quote left |
| private final static char SR = 0x92; // single-quote right |
| private final static char DL = 0x93; // double-quote left |
| private final static char DR = 0x94; // double-quote right |
| private int FMT_ANPA_1312 = 0x00; // "NAA 89-3 (ANPA 1312)" |
| private int FMT_ANPA_UPI = 0x01; // "United Press International ANPA 1312 variant" |
| private int FMT_ANPA_UPI_DL = 0x02; // "United Press International Down-Load Message" |
| private int FMT_IPTC_7901 = 0x03; // "IPTC7901 Recommended Message Format" |
| private int FMT_IPTC_PHOTO = 0x04; // "IPTC-NAA Digital Newsphoto Parameter Record" |
| private int FMT_IPTC_CHAR = 0x05; |
| // "IPTC Unstructured Character Oriented File Format (UCOFF)" |
| private int FMT_NITF = 0x06; // "News Industry Text Format (NITF)" |
| private int FMT_NITF_TT = 0x07; // "Tidningarnas Telegrambyra NITF version (TTNITF DTD)" |
| private int FMT_NITF_RB = 0x08; // "Ritzaus Bureau NITF version (RBNITF DTD)" |
| private int FMT_IPTC_AP = 0x09; // "Associated Press news wire format" |
| private int FMT_IPTC_BLM = 0x0A; // "Bloomberg News news wire format" |
| private int FMT_IPTC_NYT = 0x0B; // "New York Times news wire format" |
| private int FMT_IPTC_RTR = 0x0C; // "Reuters news wire format" |
| private int FORMAT = FMT_ANPA_1312; // assume the default format to be ANPA-1312 |
| |
| public Set<MediaType> getSupportedTypes(ParseContext context) { |
| return SUPPORTED_TYPES; |
| } |
| |
| public void parse(InputStream stream, ContentHandler handler, Metadata metadata, |
| ParseContext context) throws IOException, SAXException, TikaException { |
| |
| HashMap<String, String> properties = this.loadProperties(stream); |
| this.setMetadata(metadata, properties); |
| |
| XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); |
| xhtml.startDocument(); |
| // TODO: put body content here |
| xhtml.startElement("p"); |
| String body = clean(properties.get("body")); |
| if (body != null) { |
| xhtml.characters(body); |
| } |
| xhtml.endElement("p"); |
| xhtml.endDocument(); |
| } |
| |
| /** |
| * @deprecated This method will be removed in Apache Tika 1.0. |
| */ |
| public void parse(InputStream stream, ContentHandler handler, Metadata metadata) |
| throws IOException, SAXException, TikaException { |
| parse(stream, handler, metadata, new ParseContext()); |
| } |
| |
| /** |
| * scan the news messsage and store the metadata and data into a map |
| */ |
| private HashMap<String, String> loadProperties(InputStream is) { |
| |
| HashMap<String, String> properties = new HashMap<>(); |
| |
| FORMAT = this.scanFormat(is); |
| |
| byte[] residual = this.getSection(is, "residual"); |
| |
| byte[] header = this.getSection(is, "header"); |
| parseHeader(header, properties); |
| |
| byte[] body = this.getSection(is, "body"); |
| parseBody(body, properties); |
| |
| byte[] footer = this.getSection(is, "footer"); |
| parseFooter(footer, properties); |
| |
| return (properties); |
| } |
| |
| |
| private int scanFormat(InputStream is) { |
| int format = this.FORMAT; |
| int maxsize = 524288; // 512K |
| |
| byte[] buf = new byte[maxsize]; |
| try { |
| if (is.markSupported()) { |
| is.mark(maxsize); |
| } |
| int msgsize = is.read(buf); // read in at least the full data |
| |
| String message = (new String(buf, UTF_8)).toLowerCase(Locale.ROOT); |
| // these are not if-then-else, because we want to go from most common |
| // and fall through to least. this is imperfect, as these tags could |
| // show up in other agency stories, but i can't find a spec or any |
| // explicit codes to identify the wire source in the message itself |
| |
| if (message.contains("ap-wf")) { |
| format = this.FMT_IPTC_AP; |
| } |
| if (message.contains("reuters")) { |
| format = this.FMT_IPTC_RTR; |
| } |
| if (message.contains("new york times")) { |
| format = this.FMT_IPTC_NYT; |
| } |
| if (message.contains("bloomberg news")) { |
| format = this.FMT_IPTC_BLM; |
| } |
| } catch (IOException eio) { |
| // we are in an unstable state |
| } |
| |
| try { |
| if (is.markSupported()) { |
| is.reset(); |
| } |
| } catch (IOException eio) { |
| // we are in an unstable state |
| } |
| return (format); |
| } |
| |
| |
| private void setFormat(int format) { |
| this.FORMAT = format; |
| } |
| |
| |
| private String getFormatName() { |
| |
| String name = ""; |
| |
| if (FORMAT == this.FMT_IPTC_AP) { |
| name = "Associated Press"; |
| } else if (FORMAT == this.FMT_IPTC_BLM) { |
| name = "Bloomberg"; |
| } else if (FORMAT == this.FMT_IPTC_NYT) { |
| name = "New York Times"; |
| } else if (FORMAT == this.FMT_IPTC_RTR) { |
| name = "Reuters"; |
| } |
| |
| return (name); |
| } |
| |
| |
| private byte[] getSection(InputStream is, String name) { |
| |
| byte[] value = new byte[0]; |
| |
| switch (name) { |
| case "residual": { |
| // the header shouldn't be more than 1k, but just being generous here |
| int maxsize = 8192; // 8K |
| |
| byte bstart = |
| SYN; // check for SYN [0x16 : ctrl-v] (may have leftover residue from |
| |
| // preceding message) |
| byte bfinish = |
| SOH; // check for SOH [0x01 : ctrl-a] (typically follows a pair of SYN |
| |
| // [0x16 : ctrl-v]) |
| value = getSection(is, maxsize, bstart, bfinish, true); |
| break; |
| } |
| case "header": { |
| // the header shouldn't be more than 1k, but just being generous here |
| int maxsize = 8192; // 8K |
| |
| byte bstart = |
| SOH; // check for SOH [0x01 : ctrl-a] (typically follows a pair of SYN |
| |
| // [0x16 : ctrl-v]) |
| byte bfinish = |
| STX; // check for STX [0x02 : ctrl-b] (marks end of header, beginning of |
| |
| // message) |
| value = getSection(is, maxsize, bstart, bfinish, true); |
| break; |
| } |
| case "body": { |
| // the message shouldn't be more than 16k (?), leaving plenty of space |
| int maxsize = 524288; // 512K |
| |
| byte bstart = |
| STX; // check for STX [0x02 : ctrl-b] (marks end of header, beginning of |
| |
| // message) |
| byte bfinish = |
| ETX; // check for ETX [0x03 : ctrl-c] (marks end of message, beginning of |
| |
| // footer) |
| value = getSection(is, maxsize, bstart, bfinish, true); |
| break; |
| } |
| case "footer": { |
| // the footer shouldn't be more than 1k , leaving plenty of space |
| int maxsize = 8192; // 8K |
| |
| byte bstart = |
| ETX; // check for ETX [0x03 : ctrl-c] (marks end of message, beginning of |
| |
| // footer) |
| byte bfinish = EOT; // check for EOT [0x04 : ctrl-d] (marks end of transmission) |
| |
| value = getSection(is, maxsize, bstart, bfinish, true); |
| break; |
| } |
| } |
| |
| return (value); |
| } |
| |
| |
| private byte[] getSection(InputStream is, int maxsize, byte bstart, byte bfinish, |
| boolean ifincomplete) { |
| byte[] value = new byte[0]; |
| |
| try { |
| boolean started = false; // check if we have found the start flag |
| boolean finished = false; // check if we have found the finish flag |
| int read = 0; // the number of bytes we read |
| int start = 0; // the position after the start flag |
| |
| // TODO: this only pulls back 8K of data on a read, regardless of buffer size |
| // more nefariously, it caps at a total 8K, through all sections |
| int streammax = is.available(); |
| maxsize = Math.min(maxsize, streammax); |
| |
| is.mark(maxsize); |
| byte[] buf = new byte[maxsize]; |
| int totsize = 0; |
| int remainder = maxsize - totsize; |
| while (remainder > 0) { |
| int msgsize = is.read(buf, maxsize - remainder, |
| maxsize); // read in at least the full data |
| if (msgsize == -1) { |
| remainder = msgsize = 0; |
| } |
| remainder -= msgsize; |
| totsize += msgsize; |
| } |
| |
| // scan through the provided input stream |
| for (read = 0; read < totsize; read++) { |
| byte b = buf[read]; |
| |
| if (!started) { |
| started = (b == bstart); |
| start = read + 1; |
| continue; |
| } |
| |
| if (finished = (b == bfinish)) { |
| /* |
| is.reset(); |
| long skipped = is.skip((long)read); |
| if (skipped != read) { |
| // we are in an unstable state |
| } |
| is.mark(1); |
| */ |
| break; |
| } |
| |
| // load from the stream until we run out of characters, or hit the termination byte |
| continue; |
| } |
| |
| // move the input stream back to where it was initially |
| is.reset(); |
| |
| if (finished) { |
| // now, we want to reset the stream to be sitting right on top of the finish marker |
| is.skip(read); |
| value = new byte[read - start]; |
| System.arraycopy(buf, start, value, 0, read - start); |
| } else { |
| if (ifincomplete && started) { |
| // the caller wants anything that was read, and we finished the stream or buffer |
| value = new byte[read - start]; |
| System.arraycopy(buf, start, value, 0, read - start); |
| } |
| } |
| } catch (IOException eio) { |
| // something invalid occurred, return an empty string |
| } |
| |
| return (value); |
| } |
| |
| |
| private boolean parseHeader(byte[] value, HashMap<String, String> properties) { |
| boolean added = false; |
| |
| String env_serviceid = ""; |
| String env_category = ""; |
| String env_urgency = ""; |
| String hdr_edcode = ""; |
| StringBuilder hdr_subject = new StringBuilder(); |
| StringBuilder hdr_date = new StringBuilder(); |
| StringBuilder hdr_time = new StringBuilder(); |
| |
| int read = 0; |
| |
| while (read < value.length) { |
| |
| // pull apart the envelope, getting the service id (....\x1f) |
| while (read < value.length) { |
| byte val_next = value[read++]; |
| if (val_next != FS) { |
| env_serviceid += |
| (char) (val_next & 0xff); // convert the byte to an unsigned int |
| } else { |
| break; |
| } |
| } |
| |
| // pull apart the envelope, getting the category (....\x13\x11) |
| while (read < value.length) { |
| byte val_next = value[read++]; |
| if (val_next != XS) { // the end of the envelope is marked (\x13) |
| env_category += |
| (char) (val_next & 0xff); // convert the byte to an unsigned int |
| } else { |
| val_next = value[read]; // get the remaining byte (\x11) |
| if (val_next == XQ) { |
| read++; |
| } |
| break; |
| } |
| } |
| |
| // pull apart the envelope, getting the subject heading |
| while (read < value.length) { |
| boolean subject = true; |
| byte val_next = value[read++]; |
| while ((subject) && (val_next != SP) && |
| (val_next != 0x00)) { // ignore the envelope subject |
| hdr_subject.append((char) (val_next & 0xff)); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| while (val_next == SP) { // consume all the spaces |
| subject = false; |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| if (val_next != SP) { |
| --read; // otherwise we eat into the next section |
| } |
| } |
| } |
| if (!subject) { |
| break; |
| } |
| } |
| |
| // pull apart the envelope, getting the date and time |
| while (read < value.length) { |
| byte val_next = value[read++]; |
| if (hdr_date.length() == 0) { |
| while (((val_next >= (byte) 0x30) && (val_next <= (byte) 0x39)) |
| // consume all numerics and hyphens |
| || (val_next == HY)) { |
| hdr_date.append((char) (val_next & 0xff)); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| } |
| } else if (val_next == SP) { |
| while (val_next == SP) { // consume all the spaces |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| } |
| continue; |
| } else { |
| while (((val_next >= (byte) 0x30) && (val_next <= (byte) 0x39)) |
| // consume all numerics and hyphens |
| || (val_next == HY)) { |
| hdr_time.append((char) (val_next & 0xff)); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| } |
| } |
| } |
| break; // don't let this run back through and start thrashing metadata |
| } |
| |
| // if we were saving any of these values, we would set the properties map here |
| |
| added = (env_serviceid.length() + env_category.length() + hdr_subject.length() + |
| hdr_date.length() + hdr_time.length()) > 0; |
| return added; |
| } |
| |
| private boolean parseBody(byte[] value, HashMap<String, String> properties) { |
| boolean added = false; |
| |
| StringBuilder bdy_heading = new StringBuilder(); |
| StringBuilder bdy_title = new StringBuilder(); |
| StringBuilder bdy_source = new StringBuilder(); |
| StringBuilder bdy_author = new StringBuilder(); |
| StringBuilder bdy_body = new StringBuilder(); |
| |
| int read = 0; |
| boolean done = false; |
| |
| while (!done && (read < value.length)) { |
| |
| // pull apart the body, getting the heading (^....\x0d\x0a) |
| while (read < value.length) { |
| byte val_next = value[read++]; |
| if (val_next == CT) { // start of a new section , first is the heading |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| // AP, NYT, and Bloomberg end with < , Reuters with EOL |
| while ((val_next != LT) && (val_next != CR) && |
| (val_next != LF)) { // less than delimiter (\x3c) and not EOL |
| bdy_heading.append((char) (val_next & 0xff)); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| if (read >= value.length) { |
| break; |
| } // shouldn't ever hit this, but save a NPE |
| } |
| if (val_next == LT) { |
| // hit the delimiter, carry on |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| } |
| while (bdy_heading.length() > 0 && ((val_next == CR) || (val_next == LF))) { |
| val_next = |
| (read < value.length) ? value[read++] : 0x00; // skip the new lines |
| if ((val_next != CR) && (val_next != LF)) { |
| --read; |
| } |
| } |
| } else { |
| // this will only be hit on poorly-formed files |
| |
| // for reuters, the heading does not start with the ^, so we push one back |
| // into the stream |
| if (FORMAT == this.FMT_IPTC_RTR) { |
| if (val_next != CT) { |
| // for any non-whitespace, we need to go back an additional step to |
| // non destroy the data |
| if ((val_next != SP) && (val_next != TB) && (val_next != CR) && |
| (val_next != LF)) { |
| // if the very first byte is data, we have to shift the whole |
| // array, and stuff in a carat |
| if (read == 1) { |
| byte[] resize = new byte[value.length + 1]; |
| System.arraycopy(value, 0, resize, 1, value.length); |
| value = resize; |
| } |
| } |
| value[--read] = CT; |
| continue; |
| } |
| } |
| } |
| break; |
| } |
| |
| // pull apart the body, getting the title (^....\x0d\x0a) |
| while (read < value.length) { |
| byte val_next = value[read++]; |
| if (val_next == CT) { // start of a new section , first is the heading |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| // AP, NYT, and Bloomberg end with < , Reuters with EOL |
| while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next != |
| LF)) { // less than delimiter (\x3c), or carat (\x5e) and not EOL |
| bdy_title.append((char) (val_next & 0xff)); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| if (read >= value.length) { |
| break; |
| } // shouldn't ever hit this, but save a NPE |
| } |
| |
| if (val_next == |
| CT) { // start of a new section , when first didn't finish cleanly |
| --read; |
| } |
| |
| if (val_next == LT) { |
| // hit the delimiter, carry on |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| } |
| |
| while (bdy_title.length() > 0 && ((val_next == CR) || (val_next == LF))) { |
| val_next = |
| (read < value.length) ? value[read++] : 0x00; // skip the new lines |
| if ((val_next != CR) && (val_next != LF)) { |
| --read; |
| } |
| } |
| } else { |
| // this will only be hit on poorly-formed files |
| |
| // for bloomberg, the title does not start with the ^, so we push one back |
| // into the stream |
| if (FORMAT == this.FMT_IPTC_BLM) { |
| if (val_next == TB) { |
| value[--read] = CT; |
| continue; |
| } |
| } |
| |
| // for reuters, the title does not start with the ^, so we push one back into |
| // the stream |
| if (FORMAT == this.FMT_IPTC_RTR) { |
| if (val_next != CT) { |
| // for any non-whitespace, we need to go back an additional step to |
| // non destroy the data |
| if ((val_next != SP) && (val_next != TB) && (val_next != CR) && |
| (val_next != LF)) { |
| --read; |
| } |
| value[--read] = CT; |
| continue; |
| } |
| } |
| } |
| break; |
| } |
| |
| |
| // at this point, we have a variable number of metadata lines, with various orders |
| // we scan the start of each line for the special character, and run to the end |
| // character |
| // pull apart the body, getting the title (^....\x0d\x0a) |
| boolean metastarted = false; |
| String longline = ""; |
| String longkey; |
| while (read < value.length) { |
| byte val_next = value[read++]; |
| |
| // eat up whitespace before committing to the next section |
| if ((val_next == SP) || (val_next == TB) || (val_next == CR) || (val_next == LF)) { |
| continue; |
| } |
| |
| if (val_next == |
| CT) { // start of a new section , could be authors, sources, etc |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| StringBuilder tmp_line = new StringBuilder(); |
| while ((val_next != LT) && (val_next != CT) && (val_next != CR) && |
| (val_next != LF) && (val_next != 0)) { |
| // less than delimiter (\x3c), maybe also badly formed with just new line |
| tmp_line.append((char) (val_next & 0xff)); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| if (read >= value.length) { |
| break; |
| } // shouldn't ever hit this, but save a NPE |
| } |
| |
| if (val_next == |
| CT) { // start of a new section , when first didn't finish cleanly |
| --read; |
| } |
| |
| if (val_next == LT) { |
| // hit the delimiter, carry on |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| } |
| |
| while ((val_next == CR) || (val_next == LF)) { |
| val_next = |
| (read < value.length) ? value[read++] : 0x00; // skip the new lines |
| if ((val_next != CR) && (val_next != LF)) { |
| --read; |
| } |
| } |
| if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("by") || |
| longline.equals("bdy_author")) { |
| longkey = "bdy_author"; |
| |
| // prepend a space to subsequent line, so it gets parsed consistent with |
| // the lead line |
| tmp_line.insert(0, (longline.equals(longkey) ? " " : "")); |
| |
| // we have an author candidate |
| int term = tmp_line.length(); |
| term = Math.min(term, |
| (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term)); |
| term = Math.min(term, |
| (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term)); |
| term = Math.min(term, |
| (tmp_line.toString().contains("\n") ? tmp_line.indexOf("\n") : term)); |
| term = (term > 0) ? term : tmp_line.length(); |
| bdy_author.append(tmp_line.substring(tmp_line.indexOf(" "), term)); |
| metastarted = true; |
| longline = |
| ((tmp_line.toString().contains("=")) && (!longline.equals(longkey)) ? longkey : |
| ""); |
| } else if (FORMAT == this.FMT_IPTC_BLM) { |
| String byline = " by "; |
| if (tmp_line.toString().toLowerCase(Locale.ROOT).contains(byline)) { |
| longkey = "bdy_author"; |
| |
| int term = tmp_line.length(); |
| term = Math.min(term, |
| (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term)); |
| term = Math.min(term, |
| (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term)); |
| term = Math.min(term, |
| (tmp_line.toString().contains("\n") ? tmp_line.indexOf("\n") : term)); |
| term = (term > 0) ? term : tmp_line.length(); |
| // for bloomberg, the author line sits below their copyright statement |
| bdy_author.append(tmp_line.substring( |
| tmp_line.toString().toLowerCase(Locale.ROOT).indexOf(byline) + |
| byline.length(), term)).append(" "); |
| metastarted = true; |
| longline = ((tmp_line.toString().contains("=")) && (!longline.equals(longkey)) ? |
| longkey : ""); |
| } else if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("c.")) { |
| // the author line for bloomberg is a multiline starting with c.2011 |
| // Bloomberg News |
| // then containing the author info on the next line |
| if (val_next == TB) { |
| value[--read] = CT; |
| continue; |
| } |
| } else if (tmp_line.toString().toLowerCase(Locale.ROOT).trim().startsWith("(") && |
| tmp_line.toString().toLowerCase(Locale.ROOT).trim().endsWith(")")) { |
| // the author line may have one or more comment lines between the |
| // copyright |
| // statement, and the By AUTHORNAME line |
| if (val_next == TB) { |
| value[--read] = CT; |
| continue; |
| } |
| } |
| } else if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("eds") || |
| longline.equals("bdy_source")) { |
| longkey = "bdy_source"; |
| // prepend a space to subsequent line, so it gets parsed consistent with |
| // the lead line |
| tmp_line.insert(0, (longline.equals(longkey) ? " " : "")); |
| |
| // we have a source candidate |
| int term = tmp_line.length(); |
| term = Math.min(term, |
| (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term)); |
| term = Math.min(term, |
| (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term)); |
| // term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? tmp_line.indexOf("\n") : |
| // term)); |
| term = (term > 0) ? term : tmp_line.length(); |
| bdy_source.append(tmp_line.substring(tmp_line.indexOf(" ") + 1, term)).append(" "); |
| metastarted = true; |
| longline = (!longline.equals(longkey) ? longkey : ""); |
| } else { |
| // this has fallen all the way through. trap it as part of the subject, |
| // rather than just losing it |
| if (!metastarted) { |
| bdy_title.append(" , ").append(tmp_line); // not sure where else to put this but in the |
| // title |
| } else { |
| // what to do with stuff that is metadata, which falls after metadata |
| // lines started? |
| bdy_body.append(" ") |
| .append(tmp_line) |
| .append(" , "); // not sure where else to put this but in the title |
| } |
| } |
| } else { // we're on to the main body |
| while ((read < value.length) && (val_next != 0)) { |
| // read until the train runs out of tracks |
| bdy_body.append((char) (val_next & 0xff)); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| if (read >= value.length) { |
| break; |
| } // shouldn't ever hit this, but save a NPE |
| } |
| |
| } |
| // we would normally break here, but just let this read out to the end |
| } |
| done = true; // don't let this run back through and start thrashing metadata |
| } |
| properties.put("body", bdy_body.toString()); |
| properties.put("title", bdy_title.toString()); |
| properties.put("subject", bdy_heading.toString()); |
| properties.put("author", bdy_author.toString()); |
| properties.put("source", bdy_source.toString()); |
| |
| added = (bdy_body.length() + bdy_title.length() + bdy_heading.length() + |
| bdy_author.length() + bdy_source.length()) > 0; |
| return added; |
| } |
| |
| |
| private boolean parseFooter(byte[] value, HashMap<String, String> properties) { |
| boolean added = false; |
| |
| StringBuilder ftr_source = new StringBuilder(); |
| String ftr_datetime = ""; |
| |
| int read = 0; |
| boolean done = false; |
| |
| while (!done && (read < value.length)) { |
| |
| // pull apart the footer, getting the news feed source (^....\x0d\x0a) |
| byte val_next = value[read++]; |
| byte val_peek = (read < value.length) ? value[read + 1] : 0x00; // skip the new lines |
| |
| while (((val_next < (byte) 0x30) || (val_next > (byte) 0x39)) && |
| (val_next != 0)) { // consume all non-numerics first |
| ftr_source.append((char) (val_next & 0xff)); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read] : |
| 0x00; // attempt to read until end of stream |
| read++; |
| if (read >= value.length) { |
| break; |
| } // shouldn't ever hit this, but save a NPE |
| } |
| |
| while ((val_next != LT) && (val_next != CR) && (val_next != LF) && |
| (val_next != 0)) { // get as much timedate as possible |
| // this is an american format, so arrives as mm-dd-yy HHiizzz |
| ftr_datetime += (char) (val_next & 0xff); // convert the byte to an unsigned int |
| val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines |
| if (read >= value.length) { |
| break; |
| } // shouldn't ever hit this, but save a NPE |
| } |
| if (val_next == LT) { |
| // hit the delimiter, carry on |
| val_next = (read < value.length) ? value[read++] : 0x00; |
| } |
| |
| if (ftr_datetime.length() > 0) { |
| // we want to pass this back in a more friendly format |
| String format_out = "yyyy-MM-dd'T'HH:mm:ss'Z'"; |
| Date dateunix = new Date(); |
| try { |
| // standard ap format |
| String format_in = "MM-dd-yy HHmmzzz"; |
| |
| if (FORMAT == this.FMT_IPTC_RTR) { |
| // standard reuters format |
| format_in = "HH:mm MM-dd-yy"; |
| } |
| SimpleDateFormat dfi = new SimpleDateFormat(format_in, Locale.ROOT); |
| dfi.setTimeZone(TimeZone.getTimeZone("UTC")); |
| dateunix = dfi.parse(ftr_datetime); |
| } catch (ParseException ep) { |
| // failed, but this will just fall through to setting the date to now |
| } |
| SimpleDateFormat dfo = new SimpleDateFormat(format_out, Locale.ROOT); |
| dfo.setTimeZone(TimeZone.getTimeZone("UTC")); |
| ftr_datetime = dfo.format(dateunix); |
| } |
| while ((val_next == CR) || (val_next == LF)) { |
| val_next = (read < value.length) ? value[read++] : 0x00; // skip the new lines |
| if ((val_next != CR) && (val_next != LF)) { |
| --read; |
| } |
| } |
| done = true; // don't let this run back through and start thrashing metadata |
| } |
| |
| properties.put("publisher", ftr_source.toString()); |
| properties.put("created", ftr_datetime); |
| properties.put("modified", ftr_datetime); |
| |
| added = (ftr_source.length() + ftr_datetime.length()) > 0; |
| return added; |
| } |
| |
| |
| private void setMetadata(Metadata metadata, HashMap<String, String> properties) { |
| |
| // every property that gets set must be non-null, or it will cause NPE |
| // in other consuming applications, like Lucene |
| metadata.set(Metadata.CONTENT_TYPE, clean("text/anpa-1312")); |
| metadata.set(TikaCoreProperties.TITLE, clean(properties.get("title"))); |
| metadata.set(TikaCoreProperties.SUBJECT, clean(properties.get("subject"))); |
| metadata.set(TikaCoreProperties.CREATOR, clean(properties.get("author"))); |
| metadata.set(TikaCoreProperties.CREATED, clean(properties.get("created"))); |
| metadata.set(TikaCoreProperties.MODIFIED, clean(properties.get("modified"))); |
| metadata.set(TikaCoreProperties.SOURCE, clean(properties.get("source"))); |
| // metadata.set(TikaCoreProperties.PUBLISHER, clean(properties.get("publisher"))); |
| metadata.set(TikaCoreProperties.PUBLISHER, clean(this.getFormatName())); |
| |
| /* |
| metadata.set(TikaCoreProperties.DATE, font.getHeader().getCreated().getTime()); |
| metadata.set( |
| Property.internalDate(TikaCoreProperties.MODIFIED), |
| font.getHeader().getModified().getTime()); |
| */ |
| } |
| |
| private String clean(String value) { |
| if (value == null) { |
| value = ""; |
| } |
| |
| value = value.replaceAll("``", "`"); |
| value = value.replaceAll("''", "'"); |
| value = value.replaceAll(new String(new char[]{SL}), "'"); |
| value = value.replaceAll(new String(new char[]{SR}), "'"); |
| value = value.replaceAll(new String(new char[]{DL}), "\""); |
| value = value.replaceAll(new String(new char[]{DR}), "\""); |
| value = value.trim(); |
| |
| return (value); |
| } |
| } |