uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCM.java - uima-ducc - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.ducc.sampleapps;

 /*
  * This sample Cas Multiplier uses paragraph boundaries to segment a text file,
  * or a part of a text file, into multiple documents. A child CAS is created
  * for each document. Paragraphs that cross block boundaries are processed
  * in the block where they started. An error is thrown if a paragraph crosses
  * two block boundaries.
  *
  * See more information in DUCC Book chapters on sample applications.
  *
  */

 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.channels.FileChannel;
 import java.util.Arrays;
 import java.util.Iterator;

 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.AbstractCas;
 import org.apache.uima.cas.FeatureStructure;
 import org.apache.uima.ducc.Workitem;
 import org.apache.uima.ducc.sampleapps.DuccDocumentInfo;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.util.Level;
 import org.apache.uima.util.Logger;

 /**
  * Sample CAS multiplier that splits one block of a text file into documents
  * at blank-line boundaries (two consecutive line-feed bytes) and emits one
  * child CAS per document.
  *
  * <p>The incoming CAS must carry a {@link Workitem} that names the input
  * file, the block index and block size, and the number of bytes to read. A
  * document that starts in this block but ends in the following one is
  * completed by reading one extra block ("spilling"). Text in front of the
  * first separator of a non-initial block is skipped, since a paragraph is
  * processed in the block where it started.
  */
 public class DuccTextCM extends JCasMultiplier_ImplBase {
   /** Line-feed byte; two in a row mark a document boundary. */
   private static final byte NEWLINE = 10;

   /** Buffer size used on first allocation when the work item has no block size. */
   private static final int DEFAULT_BUFFER_SIZE = 20000000;

   /** How the next call to {@link #findnextdoc(NextDoc)} should locate a document. */
   private enum NextDoc { FIRSTDOC, SEP_IN_LASTBLOCK, NORMAL }

   private byte[] buffer = null;
   private int buffsize;
   private FileInputStream fis;
   FileChannel fc;
   private String inputFileName;
   private String outputFileName;
   private String language;
   private String encoding;
   private String nextDoc;
   private int nextDocOffset;
   /** Number of valid bytes in {@link #buffer}; grows when the next block is spilled in. */
   private int bytelength;
   private int blockindex;
   private boolean newWI;
   /** True once the following block has been read; no further documents are produced. */
   private boolean spilled;
   /** True when the spill-over read consumed the input file up to its end. */
   private boolean spillReachedEof;
   private boolean lastblock;
   private int docInWI;
   private long filesize;
   private Workitem wi;
   /** Scan position inside {@link #buffer}. */
   private int currentindex;
   private Logger logger;
   private NextDoc strategy;

   /**
    * Locates the next document of the current work item.
    *
    * <p>Each call advances the scan position, so it must be followed by
    * exactly one {@link #next()} when it returns {@code true}.
    *
    * @return {@code true} if a document is ready to be returned by {@link #next()}
    * @throws AnalysisEngineProcessException if reading the input file fails
    */
   public boolean hasNext() throws AnalysisEngineProcessException {
     if (spilled) {
       return false;
     }
     try {
       return findnextdoc(strategy);
     } catch (IOException e) {
       throw new AnalysisEngineProcessException(e);
     }
   }

   /**
    * Builds the child CAS for the document found by the preceding
    * {@link #hasNext()} call and attaches a {@link DuccDocumentInfo} to it.
    *
    * @return the populated child CAS
    * @throws AnalysisEngineProcessException declared by the multiplier contract
    */
   public AbstractCas next() throws AnalysisEngineProcessException {
     JCas newcas = getEmptyJCas();
     newcas.setDocumentText(getNextDocument());
     newcas.setDocumentLanguage(language);
     DuccDocumentInfo di = new DuccDocumentInfo(newcas);
     di.setInputfile(inputFileName);
     di.setOutputfile(outputFileName);
     di.setDocseq(docInWI++);
     // NOTE(review): this offset is computed in int arithmetic; the setter's
     // parameter type is not visible here, so it is left unchanged.
     di.setByteoffset(wi.getBlockindex() * wi.getBlocksize() + nextDocOffset);
     di.addToIndexes();
     return newcas;
   }

   /**
    * Accepts a work item CAS, opens the input file at the start of the
    * requested block and resets the per-work-item state.
    *
    * @param jcas CAS that must contain a {@link Workitem} feature structure
    * @throws AnalysisEngineProcessException if no work item is present or the
    *         input file cannot be opened and positioned
    */
   @Override
   public void process(JCas jcas) throws AnalysisEngineProcessException {
     Iterator<FeatureStructure> fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(Workitem.type));
     if (!fsit.hasNext()) {
       throw new AnalysisEngineProcessException(new RuntimeException("No workitem FS in CAS"));
     }
     wi = (Workitem) fsit.next();
     // Widen before multiplying so large files do not overflow int.
     long blockOffset = (long) wi.getBlockindex() * wi.getBlocksize();
     logger.log(Level.INFO, "DuccTextCM: "+wi.getInputspec()+" at block "+wi.getBlockindex()+" length "+wi.getBytelength()+
         " offset "+blockOffset+" outputs "+wi.getOutputspec());
     try {
       openInputFile(wi);
     } catch (IOException e) {
       throw new AnalysisEngineProcessException(e);
     }

     ensureBufferCapacity();

     spilled = false;
     spillReachedEof = false;
     docInWI = 0;
     strategy = (blockindex == 0) ? NextDoc.FIRSTDOC : NextDoc.NORMAL;
   }

   /**
    * Stores the logger supplied by the UIMA context.
    *
    * @param aContext the UIMA context of this component
    * @throws ResourceInitializationException if the superclass initialization fails
    */
   @Override
   public void initialize(UimaContext aContext) throws ResourceInitializationException {
     super.initialize(aContext);
     logger = aContext.getLogger();
   }

   /**
    * Makes sure {@link #buffer} can hold the current block and, unless this is
    * the last block, one more read of the same length appended behind it.
    */
   private void ensureBufferCapacity() {
     int needed = lastblock ? bytelength : bytelength * 2;
     if (buffer == null) {
       int preferred = (wi.getBlocksize() > 0) ? wi.getBlocksize() * 2 : DEFAULT_BUFFER_SIZE;
       buffsize = Math.max(preferred, needed);
       buffer = new byte[buffsize];
     } else if (buffsize < needed) {
       buffsize = needed;
       buffer = new byte[buffsize];
     }
   }

   /**
    * Copies the work item's settings into fields and opens the input file
    * positioned at the first byte of the requested block.
    *
    * @param wi the work item describing the file and block to read
    * @throws IOException if the file cannot be opened or the block starts
    *         beyond the end of the file
    */
   private void openInputFile(Workitem wi) throws IOException {
     inputFileName = wi.getInputspec();
     outputFileName = wi.getOutputspec();
     bytelength = wi.getBytelength();
     blockindex = wi.getBlockindex();
     lastblock = wi.getLastBlock();
     language = wi.getLanguage();
     encoding = (null==wi.getEncoding()) ? "UTF-8" : wi.getEncoding();

     // A previous work item may have been abandoned before its stream was closed.
     if (fis != null) {
       fis.close();
     }
     fis = new FileInputStream(new File(inputFileName));
     boolean positioned = false;
     try {
       fc = fis.getChannel();
       long start = (long) wi.getBlockindex() * wi.getBlocksize();
       filesize = fc.size();
       if (start > filesize) {
         throw new IOException("Specifid start position beyond end of input file "+inputFileName);
       }
       // The channel shares its position with the stream; unlike skip() this
       // cannot stop short of the requested offset.
       fc.position(start);
       positioned = true;
     } finally {
       if (!positioned) {
         fis.close();
       }
     }
     newWI = true;
   }

   /**
    * Finds the next document in the buffer and stores its text and offset.
    *
    * @param condition how to look for the document: {@code FIRSTDOC} at the
    *        start of block 0, {@code NORMAL} otherwise, and
    *        {@code SEP_IN_LASTBLOCK} right after the next block was read
    *        because this block ended inside a separator
    * @return {@code true} if a document was found, {@code false} if this work
    *         item has no further documents
    * @throws IOException if the input cannot be read
    * @throws RuntimeException if a document does not end within the spilled block
    */
   private boolean findnextdoc(NextDoc condition) throws IOException {
     if (newWI) {
       newWI = false;
       int len = readFully(0, bytelength);
       if (len != bytelength) {
         fis.close();
         throw new IOException("Read "+len+" bytes, expected "+bytelength);
       }
       currentindex = 0;
     }

     if (condition.equals(NextDoc.SEP_IN_LASTBLOCK)) {
       // currentindex is the first byte of the spilled block.
       if (atSeparator(currentindex)) {
         return false; // the separator continues; the next block owns what follows
       }
       if (currentindex < bytelength && NEWLINE == buffer[currentindex]) {
         currentindex++; // point at first char in Doc
       }
       if (currentindex >= bytelength) {
         return false; // nothing follows the separator
       }
       int spilledStart = currentindex;
       captureDocument(spilledStart, findSpilledDocEnd());
       return true;
     }

     if (condition.equals(NextDoc.FIRSTDOC)) {
       // Only once per file: skip leading newlines in front of the first doc.
       strategy = NextDoc.NORMAL;
       if (!skipNewlines()) {
         return findDocPastBlockEnd();
       }
     }

     if (condition.equals(NextDoc.NORMAL)) {
       // currentindex is either on a separator or, in a fresh block, possibly
       // inside a document that started in the previous block.
       if (!seekSeparator()) {
         fis.close();
         return false;
       }
       if (!skipNewlines()) {
         return findDocPastBlockEnd();
       }
     }

     int startloc = currentindex;
     int endloc;
     if (seekSeparator()) {
       endloc = currentindex;
     } else if (lastblock) {
       endloc = endOfDataTrimmed();
     } else {
       // The document runs past this block: pull in the next one and keep looking.
       readNextBlock();
       endloc = findSpilledDocEnd();
     }
     captureDocument(startloc, endloc);
     return true;
   }

   /** Returns true if a two-newline separator starts at {@code index} within the valid data. */
   private boolean atSeparator(int index) {
     return index + 1 < bytelength && NEWLINE == buffer[index] && NEWLINE == buffer[index + 1];
   }

   /**
    * Advances {@link #currentindex} to the next separator at or after it.
    *
    * @return {@code true} if a separator was found; otherwise the index is
    *         left at the last valid byte (or unchanged if already beyond it)
    */
   private boolean seekSeparator() {
     while (currentindex < bytelength - 1) {
       if (atSeparator(currentindex)) {
         return true;
       }
       currentindex++;
     }
     return false;
   }

   /**
    * Advances {@link #currentindex} over consecutive newline bytes.
    *
    * @return {@code true} if it stopped on a non-newline byte inside the valid
    *         data, {@code false} if it ran off the end
    */
   private boolean skipNewlines() {
     while (currentindex < bytelength && NEWLINE == buffer[currentindex]) {
       currentindex++;
     }
     return currentindex < bytelength;
   }

   /**
    * Handles a block that ends inside a separator: gives up on the last block,
    * otherwise reads the next block and looks for a document starting there.
    */
   private boolean findDocPastBlockEnd() throws IOException {
     if (lastblock) {
       fis.close();
       return false;
     }
     readNextBlock();
     return findnextdoc(NextDoc.SEP_IN_LASTBLOCK);
   }

   /**
    * Returns the exclusive end of a document that continues into the spilled
    * block: the next separator, or the end of the file when the spill read
    * reached it.
    *
    * @throws RuntimeException if the document ends neither at a separator nor
    *         at the end of the file, i.e. it crosses a second block boundary
    */
   private int findSpilledDocEnd() {
     if (seekSeparator()) {
       return currentindex;
     }
     if (!spillReachedEof) {
       throw new RuntimeException("Document larger than "+bytelength+" found in "+inputFileName+" block "+blockindex);
     }
     return endOfDataTrimmed();
   }

   /** Returns the exclusive end of the valid data, not counting one trailing newline. */
   private int endOfDataTrimmed() {
     int end = bytelength;
     if (end > 0 && NEWLINE == buffer[end - 1]) {
       end--;
     }
     return end;
   }

   /**
    * Appends up to one more block of input behind the current data, closes
    * the stream and marks the work item as spilled.
    *
    * @throws IOException if no further bytes could be read
    */
   private void readNextBlock() throws IOException {
     int len;
     try {
       len = readFully(bytelength, bytelength);
       spillReachedEof = fc.position() >= filesize;
     } finally {
       fis.close();
     }
     if (len <= 0) {
       // Parenthesised so the block number is added, not concatenated.
       throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
     }
     spilled = true;
     bytelength += len;
   }

   /**
    * Reads until {@code count} bytes were stored at {@code offset} or the end
    * of the stream was reached.
    *
    * @return the number of bytes actually read
    */
   private int readFully(int offset, int count) throws IOException {
     int total = 0;
     while (total < count) {
       int n = fis.read(buffer, offset + total, count - total);
       if (n < 0) {
         break;
       }
       total += n;
     }
     return total;
   }

   /**
    * Decodes {@code buffer[start, endExclusive)} into {@link #nextDoc} and
    * records where the document starts.
    */
   private void captureDocument(int start, int endExclusive) throws IOException {
     byte[] docbytes = Arrays.copyOfRange(buffer, start, endExclusive);
     nextDoc = new String(docbytes, encoding);
     nextDocOffset = start;
   }

   /** Returns the text of the document found by the last successful search. */
   private String getNextDocument() {
     return nextDoc;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.uima.ducc.sampleapps;

	/*
	* This sample Cas Multiplier uses paragraph boundaries to segment a text file,
	* or a part of a text file, into multiple documents. A child CAS is created
	* for each document. Paragraphs that cross block boundaries are processed
	* in the block where they started. An error is thrown if a paragraph crosses
	* two block boundaries.
	*
	* See more information in DUCC Book chapters on sample applications.
	*
	*/

	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;
	import java.nio.channels.FileChannel;
	import java.util.Arrays;
	import java.util.Iterator;

	import org.apache.uima.UimaContext;
	import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
	import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
	import org.apache.uima.cas.AbstractCas;
	import org.apache.uima.cas.FeatureStructure;
	import org.apache.uima.ducc.Workitem;
	import org.apache.uima.ducc.sampleapps.DuccDocumentInfo;
	import org.apache.uima.jcas.JCas;
	import org.apache.uima.resource.ResourceInitializationException;
	import org.apache.uima.util.Level;
	import org.apache.uima.util.Logger;

	/**
	 * Sample CAS multiplier that splits one block of a text file into documents
	 * at blank-line boundaries (two consecutive line-feed bytes) and emits one
	 * child CAS per document.
	 *
	 * <p>The incoming CAS must carry a {@link Workitem} that names the input
	 * file, the block index and block size, and the number of bytes to read. A
	 * document that starts in this block but ends in the following one is
	 * completed by reading one extra block ("spilling"). Text in front of the
	 * first separator of a non-initial block is skipped, since a paragraph is
	 * processed in the block where it started.
	 */
	public class DuccTextCM extends JCasMultiplier_ImplBase {
		/** Line-feed byte; two in a row mark a document boundary. */
		private static final byte NEWLINE = 10;

		/** Buffer size used on first allocation when the work item has no block size. */
		private static final int DEFAULT_BUFFER_SIZE = 20000000;

		/** How the next call to {@link #findnextdoc(NextDoc)} should locate a document. */
		private enum NextDoc { FIRSTDOC, SEP_IN_LASTBLOCK, NORMAL }

		private byte[] buffer = null;
		private int buffsize;
		private FileInputStream fis;
		FileChannel fc;
		private String inputFileName;
		private String outputFileName;
		private String language;
		private String encoding;
		private String nextDoc;
		private int nextDocOffset;
		/** Number of valid bytes in {@link #buffer}; grows when the next block is spilled in. */
		private int bytelength;
		private int blockindex;
		private boolean newWI;
		/** True once the following block has been read; no further documents are produced. */
		private boolean spilled;
		/** True when the spill-over read consumed the input file up to its end. */
		private boolean spillReachedEof;
		private boolean lastblock;
		private int docInWI;
		private long filesize;
		private Workitem wi;
		/** Scan position inside {@link #buffer}. */
		private int currentindex;
		private Logger logger;
		private NextDoc strategy;

		/**
		 * Locates the next document of the current work item.
		 *
		 * <p>Each call advances the scan position, so it must be followed by
		 * exactly one {@link #next()} when it returns {@code true}.
		 *
		 * @return {@code true} if a document is ready to be returned by {@link #next()}
		 * @throws AnalysisEngineProcessException if reading the input file fails
		 */
		public boolean hasNext() throws AnalysisEngineProcessException {
			if (spilled) {
				return false;
			}
			try {
				return findnextdoc(strategy);
			} catch (IOException e) {
				throw new AnalysisEngineProcessException(e);
			}
		}

		/**
		 * Builds the child CAS for the document found by the preceding
		 * {@link #hasNext()} call and attaches a {@link DuccDocumentInfo} to it.
		 *
		 * @return the populated child CAS
		 * @throws AnalysisEngineProcessException declared by the multiplier contract
		 */
		public AbstractCas next() throws AnalysisEngineProcessException {
			JCas newcas = getEmptyJCas();
			newcas.setDocumentText(getNextDocument());
			newcas.setDocumentLanguage(language);
			DuccDocumentInfo di = new DuccDocumentInfo(newcas);
			di.setInputfile(inputFileName);
			di.setOutputfile(outputFileName);
			di.setDocseq(docInWI++);
			// NOTE(review): this offset is computed in int arithmetic; the setter's
			// parameter type is not visible here, so it is left unchanged.
			di.setByteoffset(wi.getBlockindex() * wi.getBlocksize() + nextDocOffset);
			di.addToIndexes();
			return newcas;
		}

		/**
		 * Accepts a work item CAS, opens the input file at the start of the
		 * requested block and resets the per-work-item state.
		 *
		 * @param jcas CAS that must contain a {@link Workitem} feature structure
		 * @throws AnalysisEngineProcessException if no work item is present or the
		 *         input file cannot be opened and positioned
		 */
		@Override
		public void process(JCas jcas) throws AnalysisEngineProcessException {
			Iterator<FeatureStructure> fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(Workitem.type));
			if (!fsit.hasNext()) {
				throw new AnalysisEngineProcessException(new RuntimeException("No workitem FS in CAS"));
			}
			wi = (Workitem) fsit.next();
			// Widen before multiplying so large files do not overflow int.
			long blockOffset = (long) wi.getBlockindex() * wi.getBlocksize();
			logger.log(Level.INFO, "DuccTextCM: "+wi.getInputspec()+" at block "+wi.getBlockindex()+" length "+wi.getBytelength()+
					" offset "+blockOffset+" outputs "+wi.getOutputspec());
			try {
				openInputFile(wi);
			} catch (IOException e) {
				throw new AnalysisEngineProcessException(e);
			}

			ensureBufferCapacity();

			spilled = false;
			spillReachedEof = false;
			docInWI = 0;
			strategy = (blockindex == 0) ? NextDoc.FIRSTDOC : NextDoc.NORMAL;
		}

		/**
		 * Stores the logger supplied by the UIMA context.
		 *
		 * @param aContext the UIMA context of this component
		 * @throws ResourceInitializationException if the superclass initialization fails
		 */
		@Override
		public void initialize(UimaContext aContext) throws ResourceInitializationException {
			super.initialize(aContext);
			logger = aContext.getLogger();
		}

		/**
		 * Makes sure {@link #buffer} can hold the current block and, unless this is
		 * the last block, one more read of the same length appended behind it.
		 */
		private void ensureBufferCapacity() {
			int needed = lastblock ? bytelength : bytelength * 2;
			if (buffer == null) {
				int preferred = (wi.getBlocksize() > 0) ? wi.getBlocksize() * 2 : DEFAULT_BUFFER_SIZE;
				buffsize = Math.max(preferred, needed);
				buffer = new byte[buffsize];
			} else if (buffsize < needed) {
				buffsize = needed;
				buffer = new byte[buffsize];
			}
		}

		/**
		 * Copies the work item's settings into fields and opens the input file
		 * positioned at the first byte of the requested block.
		 *
		 * @param wi the work item describing the file and block to read
		 * @throws IOException if the file cannot be opened or the block starts
		 *         beyond the end of the file
		 */
		private void openInputFile(Workitem wi) throws IOException {
			inputFileName = wi.getInputspec();
			outputFileName = wi.getOutputspec();
			bytelength = wi.getBytelength();
			blockindex = wi.getBlockindex();
			lastblock = wi.getLastBlock();
			language = wi.getLanguage();
			encoding = (null==wi.getEncoding()) ? "UTF-8" : wi.getEncoding();

			// A previous work item may have been abandoned before its stream was closed.
			if (fis != null) {
				fis.close();
			}
			fis = new FileInputStream(new File(inputFileName));
			boolean positioned = false;
			try {
				fc = fis.getChannel();
				long start = (long) wi.getBlockindex() * wi.getBlocksize();
				filesize = fc.size();
				if (start > filesize) {
					throw new IOException("Specifid start position beyond end of input file "+inputFileName);
				}
				// The channel shares its position with the stream; unlike skip() this
				// cannot stop short of the requested offset.
				fc.position(start);
				positioned = true;
			} finally {
				if (!positioned) {
					fis.close();
				}
			}
			newWI = true;
		}

		/**
		 * Finds the next document in the buffer and stores its text and offset.
		 *
		 * @param condition how to look for the document: {@code FIRSTDOC} at the
		 *        start of block 0, {@code NORMAL} otherwise, and
		 *        {@code SEP_IN_LASTBLOCK} right after the next block was read
		 *        because this block ended inside a separator
		 * @return {@code true} if a document was found, {@code false} if this work
		 *         item has no further documents
		 * @throws IOException if the input cannot be read
		 * @throws RuntimeException if a document does not end within the spilled block
		 */
		private boolean findnextdoc(NextDoc condition) throws IOException {
			if (newWI) {
				newWI = false;
				int len = readFully(0, bytelength);
				if (len != bytelength) {
					fis.close();
					throw new IOException("Read "+len+" bytes, expected "+bytelength);
				}
				currentindex = 0;
			}

			if (condition.equals(NextDoc.SEP_IN_LASTBLOCK)) {
				// currentindex is the first byte of the spilled block.
				if (atSeparator(currentindex)) {
					return false; // the separator continues; the next block owns what follows
				}
				if (currentindex < bytelength && NEWLINE == buffer[currentindex]) {
					currentindex++; // point at first char in Doc
				}
				if (currentindex >= bytelength) {
					return false; // nothing follows the separator
				}
				int spilledStart = currentindex;
				captureDocument(spilledStart, findSpilledDocEnd());
				return true;
			}

			if (condition.equals(NextDoc.FIRSTDOC)) {
				// Only once per file: skip leading newlines in front of the first doc.
				strategy = NextDoc.NORMAL;
				if (!skipNewlines()) {
					return findDocPastBlockEnd();
				}
			}

			if (condition.equals(NextDoc.NORMAL)) {
				// currentindex is either on a separator or, in a fresh block, possibly
				// inside a document that started in the previous block.
				if (!seekSeparator()) {
					fis.close();
					return false;
				}
				if (!skipNewlines()) {
					return findDocPastBlockEnd();
				}
			}

			int startloc = currentindex;
			int endloc;
			if (seekSeparator()) {
				endloc = currentindex;
			} else if (lastblock) {
				endloc = endOfDataTrimmed();
			} else {
				// The document runs past this block: pull in the next one and keep looking.
				readNextBlock();
				endloc = findSpilledDocEnd();
			}
			captureDocument(startloc, endloc);
			return true;
		}

		/** Returns true if a two-newline separator starts at {@code index} within the valid data. */
		private boolean atSeparator(int index) {
			return index + 1 < bytelength && NEWLINE == buffer[index] && NEWLINE == buffer[index + 1];
		}

		/**
		 * Advances {@link #currentindex} to the next separator at or after it.
		 *
		 * @return {@code true} if a separator was found; otherwise the index is
		 *         left at the last valid byte (or unchanged if already beyond it)
		 */
		private boolean seekSeparator() {
			while (currentindex < bytelength - 1) {
				if (atSeparator(currentindex)) {
					return true;
				}
				currentindex++;
			}
			return false;
		}

		/**
		 * Advances {@link #currentindex} over consecutive newline bytes.
		 *
		 * @return {@code true} if it stopped on a non-newline byte inside the valid
		 *         data, {@code false} if it ran off the end
		 */
		private boolean skipNewlines() {
			while (currentindex < bytelength && NEWLINE == buffer[currentindex]) {
				currentindex++;
			}
			return currentindex < bytelength;
		}

		/**
		 * Handles a block that ends inside a separator: gives up on the last block,
		 * otherwise reads the next block and looks for a document starting there.
		 */
		private boolean findDocPastBlockEnd() throws IOException {
			if (lastblock) {
				fis.close();
				return false;
			}
			readNextBlock();
			return findnextdoc(NextDoc.SEP_IN_LASTBLOCK);
		}

		/**
		 * Returns the exclusive end of a document that continues into the spilled
		 * block: the next separator, or the end of the file when the spill read
		 * reached it.
		 *
		 * @throws RuntimeException if the document ends neither at a separator nor
		 *         at the end of the file, i.e. it crosses a second block boundary
		 */
		private int findSpilledDocEnd() {
			if (seekSeparator()) {
				return currentindex;
			}
			if (!spillReachedEof) {
				throw new RuntimeException("Document larger than "+bytelength+" found in "+inputFileName+" block "+blockindex);
			}
			return endOfDataTrimmed();
		}

		/** Returns the exclusive end of the valid data, not counting one trailing newline. */
		private int endOfDataTrimmed() {
			int end = bytelength;
			if (end > 0 && NEWLINE == buffer[end - 1]) {
				end--;
			}
			return end;
		}

		/**
		 * Appends up to one more block of input behind the current data, closes
		 * the stream and marks the work item as spilled.
		 *
		 * @throws IOException if no further bytes could be read
		 */
		private void readNextBlock() throws IOException {
			int len;
			try {
				len = readFully(bytelength, bytelength);
				spillReachedEof = fc.position() >= filesize;
			} finally {
				fis.close();
			}
			if (len <= 0) {
				// Parenthesised so the block number is added, not concatenated.
				throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
			}
			spilled = true;
			bytelength += len;
		}

		/**
		 * Reads until {@code count} bytes were stored at {@code offset} or the end
		 * of the stream was reached.
		 *
		 * @return the number of bytes actually read
		 */
		private int readFully(int offset, int count) throws IOException {
			int total = 0;
			while (total < count) {
				int n = fis.read(buffer, offset + total, count - total);
				if (n < 0) {
					break;
				}
				total += n;
			}
			return total;
		}

		/**
		 * Decodes {@code buffer[start, endExclusive)} into {@link #nextDoc} and
		 * records where the document starts.
		 */
		private void captureDocument(int start, int endExclusive) throws IOException {
			byte[] docbytes = Arrays.copyOfRange(buffer, start, endExclusive);
			nextDoc = new String(docbytes, encoding);
			nextDocOffset = start;
		}

		/** Returns the text of the document found by the last successful search. */
		private String getNextDocument() {
			return nextDoc;
		}

	}