tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-news-module/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.iptc;

 import static java.nio.charset.StandardCharsets.UTF_8;

 import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Set;
 import java.util.TimeZone;

 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;

 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;

 /**
  * Parser for IPTC ANPA News Wire Feeds
  */
 public class IptcAnpaParser implements Parser {
     /**
      * Serial version UID
      */
     private static final long serialVersionUID = -6062820170212879115L;

     // the single media type handled by this parser
     private static final MediaType TYPE = MediaType.text("vnd.iptc.anpa");

     private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(TYPE);

     // control and punctuation characters that act as delimiters inside a wire message
     private final static char SOH = 0x01;    // start of header (ctrl-a)
     private final static char STX = 0x02;    // start of text (ctrl-b)
     private final static char ETX = 0x03;    // end of text (ctrl-c)
     private final static char EOT = 0x04;    // end of transmission (ctrl-d)
     private final static char SYN = 0x16;    // synchronous idle (ctrl-v)
     private final static char BS = 0x08;    // the backspace character (used for diacriticals)
     private final static char TB = 0x09;    // the tab character
     private final static char LF = 0x0A;    // line feed
     private final static char FF = 0x0C;    // form feed
     private final static char CR = 0x0D;    // carriage return
     private final static char XQ = 0x11;    // device control (ctrl-q)
     private final static char XS = 0x13;    // device control (ctrl-s)
     private final static char FS = 0x1F;    // a field delimiter
     private final static char HY = 0x2D;    // hyphen
     private final static char SP = 0x20;    // the blank space
     private final static char LT = 0x3C;    // less than
     private final static char EQ = 0x3D;    // equals sign
     private final static char CT = 0x5E;    // caret
     private final static char SL = 0x91;    // single-quote left
     private final static char SR = 0x92;    // single-quote right
     private final static char DL = 0x93;    // double-quote left
     private final static char DR = 0x94;    // double-quote right

     // wire format identifiers; note that these are declared as plain (non-final) instance
     // fields, not as static constants
     private int FMT_ANPA_1312 = 0x00;   // "NAA 89-3 (ANPA 1312)"
     private int FMT_ANPA_UPI = 0x01;   // "United Press International ANPA 1312 variant"
     private int FMT_ANPA_UPI_DL = 0x02;   // "United Press International Down-Load Message"
     private int FMT_IPTC_7901 = 0x03;   // "IPTC7901 Recommended Message Format"
     private int FMT_IPTC_PHOTO = 0x04;   // "IPTC-NAA Digital Newsphoto Parameter Record"
     private int FMT_IPTC_CHAR = 0x05;
             // "IPTC Unstructured Character Oriented File Format (UCOFF)"
     private int FMT_NITF = 0x06;   // "News Industry Text Format (NITF)"
     private int FMT_NITF_TT = 0x07;   // "Tidningarnas Telegrambyra NITF version (TTNITF DTD)"
     private int FMT_NITF_RB = 0x08;   // "Ritzaus Bureau NITF version (RBNITF DTD)"
     private int FMT_IPTC_AP = 0x09;   // "Associated Press news wire format"
     private int FMT_IPTC_BLM = 0x0A;   // "Bloomberg News news wire format"
     private int FMT_IPTC_NYT = 0x0B;   // "New York Times news wire format"
     private int FMT_IPTC_RTR = 0x0C;   // "Reuters news wire format"

     // the format of the message currently being parsed; overwritten by loadProperties()
     // with the result of scanFormat()
     private int FORMAT = FMT_ANPA_1312;    // assume the default format to be ANPA-1312

     /**
      * Reports the media types this parser handles.
      *
      * @param context the parse context (not consulted)
      * @return the singleton set holding the {@code text/vnd.iptc.anpa} type
      */
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         final Set<MediaType> supported = SUPPORTED_TYPES;
         return supported;
     }

     /**
      * Parses a wire message: the stream is split into properties, the metadata object is
      * populated from them, and the "body" property is written out as the text of a single
      * XHTML paragraph.
      *
      * @param stream   the message to read
      * @param handler  receives the generated XHTML SAX events
      * @param metadata populated from the properties found in the message
      * @param context  the parse context (not consulted here)
      */
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                       ParseContext context) throws IOException, SAXException, TikaException {

         final HashMap<String, String> fields = loadProperties(stream);
         setMetadata(metadata, fields);

         final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
         out.startDocument();

         // the paragraph element is always emitted, even when there is no body text
         out.startElement("p");
         final String text = clean(fields.get("body"));
         if (text != null) {
             out.characters(text);
         }
         out.endElement("p");

         out.endDocument();
     }

     /**
      * Convenience overload that parses with a freshly created, empty {@link ParseContext}.
      *
      * @deprecated This method will be removed in Apache Tika 1.0. Use
      * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} instead.
      */
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
         final ParseContext emptyContext = new ParseContext();
         parse(stream, handler, metadata, emptyContext);
     }

     /**
      * Scans the news message and stores the metadata and data it finds into a map.
      * The sections are consumed strictly in order: residual, header, body, footer.
      *
      * @param is the stream holding the wire message
      * @return the properties collected by the header, body and footer parsers
      */
     private HashMap<String, String> loadProperties(InputStream is) {
         final HashMap<String, String> collected = new HashMap<>();

         // detect the originating wire service before the sections are pulled apart
         FORMAT = scanFormat(is);

         // the residual bytes themselves are not needed; the call is kept because a successful
         // scan leaves the stream positioned on the start-of-header marker
         getSection(is, "residual");

         parseHeader(getSection(is, "header"), collected);
         parseBody(getSection(is, "body"), collected);
         parseFooter(getSection(is, "footer"), collected);

         return collected;
     }


     /**
      * Sniffs the beginning of the message (up to 512K) for tell-tale agency names and
      * returns the matching wire format. The stream is rewound to where it started before
      * this method returns.
      *
      * @param is the stream holding the wire message
      * @return the detected format identifier, or the current value of {@code FORMAT} when
      *         nothing was recognised, the stream cannot be rewound, or reading failed
      */
     private int scanFormat(InputStream is) {
         int format = this.FORMAT;
         final int maxsize = 524288;     //  512K

         // without mark/reset the sniffed bytes could never be handed back to the section
         // readers, so leave the stream untouched and stay with the current format
         if (!is.markSupported()) {
             return format;
         }

         is.mark(maxsize);
         try {
             // a single read() may legally return fewer bytes than are present, so keep
             // reading until the buffer is full or the stream is exhausted
             byte[] buf = new byte[maxsize];
             int total = 0;
             while (total < maxsize) {
                 int count = is.read(buf, total, maxsize - total);
                 if (count < 0) {
                     break;
                 }
                 total += count;
             }

             // decode only the bytes that were actually read
             String message = new String(buf, 0, total, UTF_8).toLowerCase(Locale.ROOT);

             // these are not if-then-else, because we want to go from most common
             // and fall through to least.  this is imperfect, as these tags could
             // show up in other agency stories, but i can't find a spec or any
             // explicit codes to identify the wire source in the message itself
             if (message.contains("ap-wf")) {
                 format = this.FMT_IPTC_AP;
             }
             if (message.contains("reuters")) {
                 format = this.FMT_IPTC_RTR;
             }
             if (message.contains("new york times")) {
                 format = this.FMT_IPTC_NYT;
             }
             if (message.contains("bloomberg news")) {
                 format = this.FMT_IPTC_BLM;
             }
         } catch (IOException eio) {
             // best effort: fall through with whatever format has been determined so far
         }

         try {
             is.reset();
         } catch (IOException eio) {
             // we are in an unstable state; the section readers will come back empty
         }
         return format;
     }


     /**
      * Overrides the wire format assumed for the current message.
      *
      * @param format one of the {@code FMT_*} identifiers
      */
     private void setFormat(int format) {
         FORMAT = format;
     }


     /**
      * Maps the current format to the display name of the agency.
      *
      * @return the agency name for the AP, Bloomberg, New York Times and Reuters formats;
      *         the empty string for every other format
      */
     private String getFormatName() {
         if (FORMAT == this.FMT_IPTC_AP) {
             return "Associated Press";
         }
         if (FORMAT == this.FMT_IPTC_BLM) {
             return "Bloomberg";
         }
         if (FORMAT == this.FMT_IPTC_NYT) {
             return "New York Times";
         }
         if (FORMAT == this.FMT_IPTC_RTR) {
             return "Reuters";
         }
         return "";
     }


     /**
      * Extracts one named section of the message. Each section is bounded by a pair of
      * control bytes and has its own size cap:
      * <ul>
      *   <li>"residual": SYN .. SOH, 8K (may hold leftovers of a preceding message)</li>
      *   <li>"header": SOH .. STX, 8K</li>
      *   <li>"body": STX .. ETX, 512K</li>
      *   <li>"footer": ETX .. EOT, 8K</li>
      * </ul>
      *
      * @param is   the stream holding the wire message
      * @param name the section to extract
      * @return the bytes between the two markers (partial content is accepted when the
      *         closing marker is missing); an empty array for an unknown section name
      */
     private byte[] getSection(InputStream is, String name) {
         final int limit;
         final byte opening;
         final byte closing;

         switch (name) {
             case "residual":
                 // the header shouldn't be more than 1k, but just being generous here
                 limit = 8192;       //  8K
                 opening = SYN;      // [0x16 : ctrl-v]
                 closing = SOH;      // [0x01 : ctrl-a] typically follows a pair of SYN
                 break;
             case "header":
                 // the header shouldn't be more than 1k, but just being generous here
                 limit = 8192;       //  8K
                 opening = SOH;      // [0x01 : ctrl-a]
                 closing = STX;      // [0x02 : ctrl-b] end of header, beginning of message
                 break;
             case "body":
                 // the message shouldn't be more than 16k (?), leaving plenty of space
                 limit = 524288;     //  512K
                 opening = STX;      // [0x02 : ctrl-b]
                 closing = ETX;      // [0x03 : ctrl-c] end of message, beginning of footer
                 break;
             case "footer":
                 // the footer shouldn't be more than 1k , leaving plenty of space
                 limit = 8192;       //  8K
                 opening = ETX;      // [0x03 : ctrl-c]
                 closing = EOT;      // [0x04 : ctrl-d] end of transmission
                 break;
             default:
                 return new byte[0];
         }

         return getSection(is, limit, opening, closing, true);
     }


     /**
      * Reads ahead in the stream and returns the bytes found between a start marker and a
      * finish marker.
      * <p>
      * When the finish marker is found, the stream is left positioned on that marker, so the
      * next section (whose start marker is the same byte) can pick up from there. Otherwise
      * the stream is rewound to where it was on entry.
      *
      * @param is           the stream to scan; {@code mark}/{@code reset} are used on it
      * @param maxsize      the maximum number of bytes to look at; additionally capped by
      *                     {@code is.available()}
      * @param bstart       the byte that opens the section
      * @param bfinish      the byte that closes the section
      * @param ifincomplete whether to return what was collected when the start marker was
      *                     seen but the finish marker was not
      * @return the bytes strictly between the two markers, or an empty array when nothing
      *         usable was found or an {@link IOException} occurred
      */
     private byte[] getSection(InputStream is, int maxsize, byte bstart, byte bfinish,
                               boolean ifincomplete) {
         byte[] value = new byte[0];

         try {
             // TODO: available() is only an estimate of what can be read without blocking,
             //       so this cap can make a section come back shorter than the data allows
             maxsize = Math.min(maxsize, is.available());

             is.mark(maxsize);
             byte[] buf = new byte[maxsize];
             int totsize = 0;
             while (totsize < maxsize) {
                 // ask only for the space that is left in the buffer; requesting more than
                 // that makes read() throw IndexOutOfBoundsException after a short read
                 int msgsize = is.read(buf, totsize, maxsize - totsize);
                 if (msgsize <= 0) {
                     break;   // end of stream (or a stream that makes no progress)
                 }
                 totsize += msgsize;
             }

             boolean started = false;     // have we found the start flag
             boolean finished = false;    // have we found the finish flag
             int start = 0;               // the position after the start flag
             int read;                    // the position of the byte being examined

             // scan through the buffered input
             for (read = 0; read < totsize; read++) {
                 byte b = buf[read];

                 if (!started) {
                     started = (b == bstart);
                     start = read + 1;
                     continue;
                 }

                 if (b == bfinish) {
                     finished = true;
                     break;
                 }
             }

             // move the input stream back to where it was initially
             is.reset();

             if (finished) {
                 // now, we want the stream to be sitting right on top of the finish marker;
                 // skip() may advance less than requested, so repeat until done
                 long pending = read;
                 while (pending > 0) {
                     long skipped = is.skip(pending);
                     if (skipped <= 0) {
                         break;   // cannot advance any further
                     }
                     pending -= skipped;
                 }
                 value = new byte[read - start];
                 System.arraycopy(buf, start, value, 0, read - start);
             } else if (ifincomplete && started) {
                 // the caller wants anything that was read, and we finished the stream or buffer
                 value = new byte[read - start];
                 System.arraycopy(buf, start, value, 0, read - start);
             }
         } catch (IOException eio) {
             // something invalid occurred, return an empty section
         }

         return value;
     }


     /**
      * Walks the header section of a message and picks out, in order, the service id
      * (terminated by FS), the category (terminated by XS, optionally followed by XQ), the
      * subject word (terminated by spaces), and the date and time digit groups.
      * <p>
      * None of these values is currently stored; the method only reports whether anything
      * was found.
      *
      * @param value      the raw header bytes
      * @param properties the property map; not modified by this method at present
      * @return {@code true} if at least one of the fields was non-empty
      */
     private boolean parseHeader(byte[] value, HashMap<String, String> properties) {
         StringBuilder env_serviceid = new StringBuilder();
         StringBuilder env_category = new StringBuilder();
         StringBuilder hdr_subject = new StringBuilder();
         StringBuilder hdr_date = new StringBuilder();
         StringBuilder hdr_time = new StringBuilder();

         int read = 0;

         // pull apart the envelope, getting the service id  (....\x1f)
         while (read < value.length) {
             byte val_next = value[read++];
             if (val_next == FS) {
                 break;
             }
             env_serviceid.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
         }

         // pull apart the envelope, getting the category  (....\x13\x11)
         while (read < value.length) {
             byte val_next = value[read++];
             if (val_next == XS) {   // the end of the envelope is marked (\x13)
                 // swallow the companion byte (\x11) if there is one; the header may also
                 // end right after \x13, so check the bounds before peeking
                 if ((read < value.length) && (value[read] == XQ)) {
                     read++;
                 }
                 break;
             }
             env_category.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
         }

         // pull apart the envelope, getting the subject heading
         while (read < value.length) {
             boolean subject = true;
             byte val_next = value[read++];
             // leading spaces and NUL bytes are skipped by the outer loop
             while ((subject) && (val_next != SP) && (val_next != 0x00)) {
                 hdr_subject.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                 val_next = (read < value.length) ? value[read++] : 0x00;
                 if (val_next == SP) {
                     // the subject is complete; consume the rest of the run of spaces
                     // without eating into the next section
                     subject = false;
                     while ((read < value.length) && (value[read] == SP)) {
                         read++;
                     }
                 }
             }
             if (!subject) {
                 break;
             }
         }

         // pull apart the envelope, getting the date and time
         while (read < value.length) {
             byte val_next = value[read++];
             if (hdr_date.length() == 0) {
                 // consume all numerics and hyphens
                 while (((val_next >= (byte) 0x30) && (val_next <= (byte) 0x39))
                         || (val_next == HY)) {
                     hdr_date.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                     val_next = (read < value.length) ? value[read++] : 0x00;
                 }
             } else if (val_next == SP) {
                 // consume all the spaces, leaving the first non-space byte for the next
                 // iteration so the leading digit of the time is not lost
                 while ((read < value.length) && (value[read] == SP)) {
                     read++;
                 }
             } else {
                 // consume all numerics and hyphens
                 while (((val_next >= (byte) 0x30) && (val_next <= (byte) 0x39))
                         || (val_next == HY)) {
                     hdr_time.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                     val_next = (read < value.length) ? value[read++] : 0x00;
                 }
             }
         }

         // if we were saving any of these values, we would set the properties map here

         return (env_serviceid.length() + env_category.length() + hdr_subject.length() +
                 hdr_date.length() + hdr_time.length()) > 0;
     }

     private boolean parseBody(byte[] value, HashMap<String, String> properties) {
         boolean added = false;

         StringBuilder bdy_heading = new StringBuilder();
         StringBuilder bdy_title = new StringBuilder();
         StringBuilder bdy_source = new StringBuilder();
         StringBuilder bdy_author = new StringBuilder();
         StringBuilder bdy_body = new StringBuilder();

         int read = 0;
         boolean done = false;

         while (!done && (read < value.length)) {

             // pull apart the body, getting the heading (^....\x0d\x0a)
             while (read < value.length) {
                 byte val_next = value[read++];
                 if (val_next == CT) {      //  start of a new section , first is the heading
                     val_next = (read < value.length) ? value[read++] : 0x00;
                     // AP, NYT, and Bloomberg end with < , Reuters with EOL
                     while ((val_next != LT) && (val_next != CR) &&
                             (val_next != LF)) {   // less than delimiter (\x3c) and not EOL
                         bdy_heading.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                         val_next = (read < value.length) ? value[read++] : 0x00;
                         if (read >= value.length) {
                             break;
                         }  // shouldn't ever hit this, but save a NPE
                     }
                     if (val_next == LT) {
                         // hit the delimiter, carry on
                         val_next = (read < value.length) ? value[read++] : 0x00;
                     }
                     while (bdy_heading.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                         val_next =
                                 (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                         if ((val_next != CR) && (val_next != LF)) {
                             --read;
                         }
                     }
                 } else {
                     // this will only be hit on poorly-formed files

                     // for reuters, the heading does not start with the ^, so we push one back
                     // into the stream
                     if (FORMAT == this.FMT_IPTC_RTR) {
                         if (val_next != CT) {
                             // for any non-whitespace, we need to go back an additional step to
                             // non destroy the data
                             if ((val_next != SP) && (val_next != TB) && (val_next != CR) &&
                                     (val_next != LF)) {
                                 // if the very first byte is data, we have to shift the whole
                                 // array, and stuff in a carat
                                 if (read == 1) {
                                     byte[] resize = new byte[value.length + 1];
                                     System.arraycopy(value, 0, resize, 1, value.length);
                                     value = resize;
                                 }
                             }
                             value[--read] = CT;
                             continue;
                         }
                     }
                 }
                 break;
             }

             // pull apart the body, getting the title (^....\x0d\x0a)
             while (read < value.length) {
                 byte val_next = value[read++];
                 if (val_next == CT) {      //  start of a new section , first is the heading
                     val_next = (read < value.length) ? value[read++] : 0x00;
                     // AP, NYT, and Bloomberg end with < , Reuters with EOL
                     while ((val_next != LT) && (val_next != CT) && (val_next != CR) && (val_next !=
                             LF)) {   // less than delimiter (\x3c), or carat (\x5e) and not EOL
                         bdy_title.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                         val_next = (read < value.length) ? value[read++] : 0x00;
                         if (read >= value.length) {
                             break;
                         }  // shouldn't ever hit this, but save a NPE
                     }

                     if (val_next ==
                             CT) {      //  start of a new section , when first didn't finish cleanly
                         --read;
                     }

                     if (val_next == LT) {
                         // hit the delimiter, carry on
                         val_next = (read < value.length) ? value[read++] : 0x00;
                     }

                     while (bdy_title.length() > 0 && ((val_next == CR) || (val_next == LF))) {
                         val_next =
                                 (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                         if ((val_next != CR) && (val_next != LF)) {
                             --read;
                         }
                     }
                 } else {
                     // this will only be hit on poorly-formed files

                     // for bloomberg, the title does not start with the ^, so we push one back
                     // into the stream
                     if (FORMAT == this.FMT_IPTC_BLM) {
                         if (val_next == TB) {
                             value[--read] = CT;
                             continue;
                         }
                     }

                     // for reuters, the title does not start with the ^, so we push one back into
                     // the stream
                     if (FORMAT == this.FMT_IPTC_RTR) {
                         if (val_next != CT) {
                             // for any non-whitespace, we need to go back an additional step to
                             // non destroy the data
                             if ((val_next != SP) && (val_next != TB) && (val_next != CR) &&
                                     (val_next != LF)) {
                                 --read;
                             }
                             value[--read] = CT;
                             continue;
                         }
                     }
                 }
                 break;
             }


             // at this point, we have a variable number of metadata lines, with various orders
             // we scan the start of each line for the special character, and run to the end
             // character
             // pull apart the body, getting the title (^....\x0d\x0a)
             boolean metastarted = false;
             String longline = "";
             String longkey;
             while (read < value.length) {
                 byte val_next = value[read++];

                 // eat up whitespace before committing to the next section
                 if ((val_next == SP) || (val_next == TB) || (val_next == CR) || (val_next == LF)) {
                     continue;
                 }

                 if (val_next ==
                         CT) {      //  start of a new section , could be authors, sources, etc
                     val_next = (read < value.length) ? value[read++] : 0x00;
                     StringBuilder tmp_line = new StringBuilder();
                     while ((val_next != LT) && (val_next != CT) && (val_next != CR) &&
                             (val_next != LF) && (val_next != 0)) {
                         // less than delimiter (\x3c), maybe also badly formed with just new line
                         tmp_line.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                         val_next = (read < value.length) ? value[read++] : 0x00;
                         if (read >= value.length) {
                             break;
                         }  // shouldn't ever hit this, but save a NPE
                     }

                     if (val_next ==
                             CT) {      //  start of a new section , when first didn't finish cleanly
                         --read;
                     }

                     if (val_next == LT) {
                         // hit the delimiter, carry on
                         val_next = (read < value.length) ? value[read++] : 0x00;
                     }

                     while ((val_next == CR) || (val_next == LF)) {
                         val_next =
                                 (read < value.length) ? value[read++] : 0x00;  // skip the new lines
                         if ((val_next != CR) && (val_next != LF)) {
                             --read;
                         }
                     }
                     if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("by") ||
                             longline.equals("bdy_author")) {
                         longkey = "bdy_author";

                         // prepend a space to subsequent line, so it gets parsed consistent with
                         // the lead line
                         tmp_line.insert(0, (longline.equals(longkey) ? " " : ""));

                         // we have an author candidate
                         int term = tmp_line.length();
                         term = Math.min(term,
                                 (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                         term = Math.min(term,
                                 (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
                         term = Math.min(term,
                                 (tmp_line.toString().contains("\n") ? tmp_line.indexOf("\n") : term));
                         term = (term > 0) ? term : tmp_line.length();
                         bdy_author.append(tmp_line.substring(tmp_line.indexOf(" "), term));
                         metastarted = true;
                         longline =
                                 ((tmp_line.toString().contains("=")) && (!longline.equals(longkey)) ? longkey :
                                         "");
                     } else if (FORMAT == this.FMT_IPTC_BLM) {
                         String byline = "   by ";
                         if (tmp_line.toString().toLowerCase(Locale.ROOT).contains(byline)) {
                             longkey = "bdy_author";

                             int term = tmp_line.length();
                             term = Math.min(term,
                                     (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                             term = Math.min(term,
                                     (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
                             term = Math.min(term,
                                     (tmp_line.toString().contains("\n") ? tmp_line.indexOf("\n") : term));
                             term = (term > 0) ? term : tmp_line.length();
                             // for bloomberg, the author line sits below their copyright statement
                             bdy_author.append(tmp_line.substring(
                                     tmp_line.toString().toLowerCase(Locale.ROOT).indexOf(byline) +
                                     byline.length(), term)).append(" ");
                             metastarted = true;
                             longline = ((tmp_line.toString().contains("=")) && (!longline.equals(longkey)) ?
                                     longkey : "");
                         } else if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("c.")) {
                             // the author line for bloomberg is a multiline starting with c.2011
                             // Bloomberg News
                             // then containing the author info on the next line
                             if (val_next == TB) {
                                 value[--read] = CT;
                                 continue;
                             }
                         } else if (tmp_line.toString().toLowerCase(Locale.ROOT).trim().startsWith("(") &&
                                    tmp_line.toString().toLowerCase(Locale.ROOT).trim().endsWith(")")) {
                             // the author line may have one or more comment lines between the
                             // copyright
                             // statement, and the By AUTHORNAME line
                             if (val_next == TB) {
                                 value[--read] = CT;
                                 continue;
                             }
                         }
                     } else if (tmp_line.toString().toLowerCase(Locale.ROOT).startsWith("eds") ||
                                longline.equals("bdy_source")) {
                         longkey = "bdy_source";
                         // prepend a space to subsequent line, so it gets parsed consistent with
                         // the lead line
                         tmp_line.insert(0, (longline.equals(longkey) ? " " : ""));

                         // we have a source candidate
                         int term = tmp_line.length();
                         term = Math.min(term,
                                 (tmp_line.toString().contains("<") ? tmp_line.indexOf("<") : term));
                         term = Math.min(term,
                                 (tmp_line.toString().contains("=") ? tmp_line.indexOf("=") : term));
 //                  term = Math.min(term, (tmp_line.indexOf("\n") > -1 ? tmp_line.indexOf("\n") :
 //                  term));
                         term = (term > 0) ? term : tmp_line.length();
                         bdy_source.append(tmp_line.substring(tmp_line.indexOf(" ") + 1, term)).append(" ");
                         metastarted = true;
                         longline = (!longline.equals(longkey) ? longkey : "");
                     } else {
                         // this has fallen all the way through.  trap it as part of the subject,
                         // rather than just losing it
                         if (!metastarted) {
                             bdy_title.append(" , ").append(tmp_line);     //  not sure where else to put this but in the
                             // title
                         } else {
                             // what to do with stuff that is metadata, which falls after metadata
                             // lines started?
                             bdy_body.append(" ")
                                     .append(tmp_line)
                                     .append(" , ");     //  not sure where else to put this but in the title
                         }
                     }
                 } else {  // we're on to the main body
                     while ((read < value.length) && (val_next != 0)) {
                         // read until the train runs out of tracks
                         bdy_body.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                         val_next = (read < value.length) ? value[read++] : 0x00;
                         if (read >= value.length) {
                             break;
                         }  // shouldn't ever hit this, but save a NPE
                     }

                 }
                 // we would normally break here, but just let this read out to the end
             }
             done = true; // don't let this run back through and start thrashing metadata
         }
         properties.put("body", bdy_body.toString());
         properties.put("title", bdy_title.toString());
         properties.put("subject", bdy_heading.toString());
         properties.put("author", bdy_author.toString());
         properties.put("source", bdy_source.toString());

         added = (bdy_body.length() + bdy_title.length() + bdy_heading.length() +
                 bdy_author.length() + bdy_source.length()) > 0;
         return added;
     }


     /**
      * Parses the footer of a wire message into the feed source and a timestamp.
      * <p>
      * Bytes are consumed from the start of {@code value}: everything up to the first
      * ASCII digit (or a zero byte) is collected as the source, and everything from
      * there up to the first {@code LT}, {@code CR}, {@code LF} or zero byte is taken
      * as the timestamp. The timestamp is parsed with {@code "MM-dd-yy HHmmzzz"}, or
      * with {@code "HH:mm MM-dd-yy"} when the current format is {@code FMT_IPTC_RTR},
      * and re-emitted as {@code yyyy-MM-dd'T'HH:mm:ss'Z'} in UTC. If it cannot be
      * parsed, the current time is emitted instead.
      *
      * @param value      raw footer bytes; may be empty
      * @param properties receives the keys {@code "publisher"}, {@code "created"} and
      *                   {@code "modified"} (always set, possibly to an empty string)
      * @return {@code true} if a source or a timestamp was found
      */
     private boolean parseFooter(byte[] value, HashMap<String, String> properties) {
         StringBuilder ftr_source = new StringBuilder();
         StringBuilder raw_datetime = new StringBuilder();
         String ftr_datetime = "";

         int read = 0;

         // the footer is scanned exactly once, front to back; an empty footer simply
         // yields empty properties
         if (read < value.length) {

             // pull apart the footer, getting the news feed source (^....\x0d\x0a)
             // NB: no look-ahead byte is read here; peeking at value[read + 1] ran past
             // the end of a two byte footer and the peeked value was never used
             byte val_next = value[read++];

             while (((val_next < (byte) 0x30) || (val_next > (byte) 0x39)) &&
                     (val_next != 0)) {  // consume all non-numerics first
                 ftr_source.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                 val_next = (read < value.length) ? value[read] :
                         0x00;  // attempt to read until end of stream
                 read++;
                 if (read >= value.length) {
                     break;
                 }  // ran out of input
             }

             // NOTE(review): when the input ends without a terminator, the last byte read
             // by this loop is not appended before the break -- confirm this is intended
             while ((val_next != LT) && (val_next != CR) && (val_next != LF) &&
                     (val_next != 0)) {  // get as much timedate as possible
                 // this is an american format, so arrives as mm-dd-yy HHiizzz
                 raw_datetime.append((char) (val_next & 0xff));  // convert the byte to an unsigned int
                 val_next = (read < value.length) ? value[read++] : 0x00;
                 if (read >= value.length) {
                     break;
                 }  // ran out of input
             }
             // nothing after the timestamp is used, so the delimiter and any trailing
             // new lines are left unread

             if (raw_datetime.length() > 0) {
                 // we want to pass this back in a more friendly format
                 String format_out = "yyyy-MM-dd'T'HH:mm:ss'Z'";
                 Date dateunix = new Date();
                 try {
                     // standard ap format
                     String format_in = "MM-dd-yy HHmmzzz";

                     if (FORMAT == this.FMT_IPTC_RTR) {
                         // standard reuters format
                         format_in = "HH:mm MM-dd-yy";
                     }
                     SimpleDateFormat dfi = new SimpleDateFormat(format_in, Locale.ROOT);
                     dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
                     dateunix = dfi.parse(raw_datetime.toString());
                 } catch (ParseException ignored) {
                     // deliberate best effort: an unparseable timestamp falls through to
                     // reporting the current time
                 }
                 SimpleDateFormat dfo = new SimpleDateFormat(format_out, Locale.ROOT);
                 dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
                 ftr_datetime = dfo.format(dateunix);
             }
         }

         properties.put("publisher", ftr_source.toString());
         properties.put("created", ftr_datetime);
         properties.put("modified", ftr_datetime);

         return (ftr_source.length() + ftr_datetime.length()) > 0;
     }


     /**
      * Copies the parsed properties onto the supplied metadata object.
      * <p>
      * Each value is passed through {@code clean(String)} first, which turns a missing
      * (null) property into an empty string.
      *
      * @param metadata   the metadata object to populate
      * @param properties parsed values; the keys {@code "title"}, {@code "subject"},
      *                   {@code "author"}, {@code "created"}, {@code "modified"} and
      *                   {@code "source"} are read
      */
     private void setMetadata(Metadata metadata, HashMap<String, String> properties) {

         // every property that gets set must be non-null, or it will cause NPE
         // in other consuming applications, like Lucene
         final String title = clean(properties.get("title"));
         final String subject = clean(properties.get("subject"));
         final String author = clean(properties.get("author"));
         final String created = clean(properties.get("created"));
         final String modified = clean(properties.get("modified"));
         final String source = clean(properties.get("source"));

         metadata.set(Metadata.CONTENT_TYPE, clean("text/anpa-1312"));
         metadata.set(TikaCoreProperties.TITLE, title);
         metadata.set(TikaCoreProperties.SUBJECT, subject);
         metadata.set(TikaCoreProperties.CREATOR, author);
         metadata.set(TikaCoreProperties.CREATED, created);
         metadata.set(TikaCoreProperties.MODIFIED, modified);
         metadata.set(TikaCoreProperties.SOURCE, source);

         // the "publisher" property is not read here; the publisher is reported as the
         // name of the detected format instead
         metadata.set(TikaCoreProperties.PUBLISHER, clean(this.getFormatName()));
     }

     /**
      * Normalises quoting in a metadata value and trims surrounding whitespace.
      * <p>
      * Doubled backticks and doubled apostrophes are collapsed to a single character,
      * the {@code SL} and {@code SR} characters become an apostrophe, and the
      * {@code DL} and {@code DR} characters become a double quote (presumably these
      * are the left/right single and double quote codes -- see their declarations).
      * All substitutions are literal; nothing is interpreted as a regular expression.
      *
      * @param value the value to clean; may be {@code null}
      * @return the cleaned value, never {@code null}; an empty string for {@code null}
      */
     private String clean(String value) {
         if (value == null) {
             return "";
         }

         // String.replace substitutes literally, left to right, in the order given
         return value
                 .replace("``", "`")
                 .replace("''", "'")
                 .replace(new String(new char[]{SL}), "'")
                 .replace(new String(new char[]{SR}), "'")
                 .replace(new String(new char[]{DL}), "\"")
                 .replace(new String(new char[]{DR}), "\"")
                 .trim();
     }
 }