/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.contrib.index.example;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
import org.apache.hadoop.contrib.index.mapred.DocumentID;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;

/**
 * A simple RecordReader for LineDoc: plain text files where each line is a
 * doc. Each line is of the form documentID<SPACE>op<SPACE>content<EOL>,
 * where op can be "i", "ins" or "insert" for insert, "d", "del" or "delete"
 * for delete, or "u", "upd" or "update" for update. An unrecognized op is
 * treated as an insert; for a delete, the content may be omitted.
 */
public class LineDocRecordReader implements
RecordReader<DocumentID, LineDocTextAndOp> {
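  // For illustration, a valid input file might contain lines such as the
  // following (the document IDs and contents here are made up):
  //
  //   doc1 i a new document to index
  //   doc2 del
  //   doc3 update replacement content for doc3
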
  private static final char SPACE = ' ';
  private static final char EOL = '\n';

  private long start;  // position of the first byte this reader returns
  private long pos;    // current position in the file
  private long end;    // first byte past the end of the split
  private BufferedInputStream in;
  private ByteArrayOutputStream buffer = new ByteArrayOutputStream(256);

  /**
   * Provide a bridge to get the bytes from the ByteArrayOutputStream into a
   * Text without materializing an intermediate byte array (as toByteArray()
   * would).
   */
private static class TextStuffer extends OutputStream {
public Text target;
    public void write(int b) {
      // ByteArrayOutputStream.writeTo(OutputStream) hands over the whole
      // buffer in a single write(byte[], int, int) call, so single-byte
      // writes should never happen here.
      throw new UnsupportedOperationException("write(byte) not supported");
    }
public void write(byte[] data, int offset, int len) throws IOException {
target.set(data, offset, len);
}
  }

  private TextStuffer bridge = new TextStuffer();

  /**
   * Constructor
   * @param job the job configuration
   * @param split the file split whose lines this reader returns
   * @throws IOException if the split's file cannot be opened or read
   */
public LineDocRecordReader(Configuration job, FileSplit split)
throws IOException {
long start = split.getStart();
long end = start + split.getLength();
final Path file = split.getPath();
// open the file and seek to the start of the split
FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
boolean skipFirstLine = false;
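    // A line may straddle a split boundary. Every split except the first
    // backs up one byte and then discards everything up to and including
    // the first EOL it sees: that first (possibly partial) line belongs to
    // the previous split's reader, which keeps reading past its own end
    // until it completes the line. Backing up one byte ensures that a line
    // starting exactly at the split boundary is not skipped.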
if (start != 0) {
      skipFirstLine = true; // defer the skip until the BufferedInputStream exists
--start;
fileIn.seek(start);
}
    this.in = new BufferedInputStream(fileIn);
if (skipFirstLine) { // skip first line and re-establish "start".
start += LineDocRecordReader.readData(this.in, null, EOL);
}
this.start = start;
this.pos = start;
this.end = end;
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapred.RecordReader#close()
*/
public void close() throws IOException {
in.close();
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapred.RecordReader#createKey()
*/
public DocumentID createKey() {
return new DocumentID();
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapred.RecordReader#createValue()
*/
public LineDocTextAndOp createValue() {
return new LineDocTextAndOp();
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapred.RecordReader#getPos()
*/
public long getPos() throws IOException {
return pos;
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapred.RecordReader#getProgress()
*/
public float getProgress() throws IOException {
if (start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (pos - start) / (float) (end - start));
}
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapred.RecordReader#next(java.lang.Object, java.lang.Object)
*/
public synchronized boolean next(DocumentID key, LineDocTextAndOp value)
throws IOException {
if (pos >= end) {
return false;
}
    // the key is the document id: the bytes up to the first space
if (!readInto(key.getText(), SPACE)) {
return false;
}
// read operation: i/d/u, or ins/del/upd, or insert/delete/update
Text opText = new Text();
if (!readInto(opText, SPACE)) {
return false;
}
String opStr = opText.toString();
DocumentAndOp.Op op;
if (opStr.equals("i") || opStr.equals("ins") || opStr.equals("insert")) {
op = DocumentAndOp.Op.INSERT;
} else if (opStr.equals("d") || opStr.equals("del")
|| opStr.equals("delete")) {
op = DocumentAndOp.Op.DELETE;
} else if (opStr.equals("u") || opStr.equals("upd")
|| opStr.equals("update")) {
op = DocumentAndOp.Op.UPDATE;
} else {
// default is insert
op = DocumentAndOp.Op.INSERT;
}
value.setOp(op);
if (op == DocumentAndOp.Op.DELETE) {
return true;
} else {
// read rest of the line
return readInto(value.getText(), EOL);
}
}
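
  /**
   * Reads bytes up to (and consuming, but not storing) the given delimiter
   * or EOL, whichever comes first, and stores them in the given Text.
   * Returns false if the stream was already exhausted.
   */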
private boolean readInto(Text text, char delimiter) throws IOException {
buffer.reset();
long bytesRead = readData(in, buffer, delimiter);
if (bytesRead == 0) {
return false;
}
pos += bytesRead;
bridge.target = text;
buffer.writeTo(bridge);
return true;
}
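
  /**
   * Copies bytes from in to out until EOL, the given delimiter, or end of
   * stream is reached. The terminating byte (or "\r\n" pair) is consumed
   * and counted, but not written to out. Returns the total number of bytes
   * consumed, including any terminator.
   */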
private static long readData(InputStream in, OutputStream out, char delimiter)
throws IOException {
long bytes = 0;
while (true) {
int b = in.read();
if (b == -1) {
break;
}
bytes += 1;
byte c = (byte) b;
if (c == EOL || c == delimiter) {
break;
}
      if (c == '\r') {
        // a bare '\r' ends the line; if it is followed by '\n', fold the
        // pair into a single EOL, otherwise push the lookahead byte back
        in.mark(1);
        byte nextC = (byte) in.read();
        if (nextC != EOL) {
          in.reset();
        } else {
          bytes += 1;
        }
        break;
      }
if (out != null) {
out.write(c);
}
}
return bytes;
}
}
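
/**
 * A minimal usage sketch: drives the reader over a single split spanning an
 * entire file whose path is taken from the command line. This helper class
 * and its setup are illustrative assumptions only; in a real MapReduce job
 * the framework creates the splits and instantiates the reader.
 */
class LineDocRecordReaderExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // assumed command-line argument
    FileSystem fs = path.getFileSystem(conf);
    long length = fs.getFileStatus(path).getLen();
    // one split covering the whole file, with no location hints
    FileSplit split = new FileSplit(path, 0, length, (String[]) null);
    LineDocRecordReader reader = new LineDocRecordReader(conf, split);
    DocumentID key = reader.createKey();
    LineDocTextAndOp value = reader.createValue();
    while (reader.next(key, value)) {
      // for insert/update records the value holds the line's content;
      // for delete records only the key is meaningful
      System.out.println(key.getText() + "\t" + value.getText());
    }
    reader.close();
  }
}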