/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.examples.wikisearch.ingest;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map.Entry;
import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.MutationsRejectedException;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.mock.MockInstance;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.ContextFactory;
import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Loads the bundled Wikipedia XML sample into a mock Accumulo instance by running
 * {@link WikipediaMapper}, so the resulting table contents can be inspected.
 */
public class WikipediaMapperTest {
private static final String METADATA_TABLE_NAME = "wikiMetadata";
private static final String TABLE_NAME = "wiki";
private static final String INDEX_TABLE_NAME = "wikiIndex";
private static final String RINDEX_TABLE_NAME = "wikiReverseIndex";
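
  /**
   * A {@link RecordWriter} that routes each mutation to the mock Accumulo {@link BatchWriter}
   * registered for the destination table in {@code writerMap}.
   */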
private class MockAccumuloRecordWriter extends RecordWriter<Text,Mutation> {
@Override
public void write(Text key, Mutation value) throws IOException, InterruptedException {
try {
writerMap.get(key).addMutation(value);
} catch (MutationsRejectedException e) {
throw new IOException("Error adding mutation", e);
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
try {
for (BatchWriter w : writerMap.values()) {
w.flush();
w.close();
}
} catch (MutationsRejectedException e) {
throw new IOException("Error closing Batch Writer", e);
}
}
  }

private Connector c = null;
private Configuration conf = new Configuration();
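  // One BatchWriter per output table, keyed by table name; MockAccumuloRecordWriter looks writers up here.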
  private HashMap<Text,BatchWriter> writerMap = new HashMap<Text,BatchWriter>();

@Before
public void setup() throws Exception {
conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME);
conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1");
conf.set(WikipediaConfiguration.NUM_GROUPS, "1");
MockInstance i = new MockInstance();
c = i.getConnector("root", "pass");
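    // Drop and recreate the output tables so each run starts against an empty mock instance.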
c.tableOperations().delete(METADATA_TABLE_NAME);
c.tableOperations().delete(TABLE_NAME);
c.tableOperations().delete(INDEX_TABLE_NAME);
c.tableOperations().delete(RINDEX_TABLE_NAME);
c.tableOperations().create(METADATA_TABLE_NAME);
c.tableOperations().create(TABLE_NAME);
c.tableOperations().create(INDEX_TABLE_NAME);
c.tableOperations().create(RINDEX_TABLE_NAME);
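    // Register a BatchWriter per table so MockAccumuloRecordWriter can route mutations by table name.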
writerMap.put(new Text(METADATA_TABLE_NAME), c.createBatchWriter(METADATA_TABLE_NAME, 1000L, 1000L, 1));
writerMap.put(new Text(TABLE_NAME), c.createBatchWriter(TABLE_NAME, 1000L, 1000L, 1));
writerMap.put(new Text(INDEX_TABLE_NAME), c.createBatchWriter(INDEX_TABLE_NAME, 1000L, 1000L, 1));
writerMap.put(new Text(RINDEX_TABLE_NAME), c.createBatchWriter(RINDEX_TABLE_NAME, 1000L, 1000L, 1));
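    // Build a Hadoop task attempt context and locate the sample Wikipedia dump on the test classpath.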
TaskAttemptContext context = ContextFactory.createTaskAttemptContext(conf);
RawLocalFileSystem fs = new RawLocalFileSystem();
fs.setConf(conf);
URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml");
Assert.assertNotNull(url);
File data = new File(url.toURI());
Path tmpFile = new Path(data.getAbsolutePath());
// Setup the Mapper
InputSplit split = new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null);
AggregatingRecordReader rr = new AggregatingRecordReader();
Path ocPath = new Path(tmpFile, "oc");
OutputCommitter oc = new FileOutputCommitter(ocPath, context);
fs.deleteOnExit(ocPath);
StandaloneStatusReporter sr = new StandaloneStatusReporter();
rr.initialize(split, context);
MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter();
WikipediaMapper mapper = new WikipediaMapper();
// Load data into Mock Accumulo
Mapper<LongWritable,Text,Text,Mutation>.Context con = ContextFactory.createMapContext(mapper, context, rr, rw, oc, sr, split);
mapper.run(con);
// Flush and close record writers.
rw.close(context);
}
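
  /**
   * Scans the entire table and prints every key/value pair, for manual inspection of the loaded data.
   */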
private void debugQuery(String tableName) throws Exception {
Scanner s = c.createScanner(tableName, new Authorizations("all"));
Range r = new Range();
s.setRange(r);
    for (Entry<Key,Value> entry : s) {
      System.out.println(entry.getKey().toString() + " " + entry.getValue().toString());
    }
}
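
  /**
   * Dumps the contents of every table populated in {@link #setup()}.
   */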
  @Test
  public void testViewAllData() throws Exception {
debugQuery(METADATA_TABLE_NAME);
debugQuery(TABLE_NAME);
debugQuery(INDEX_TABLE_NAME);
debugQuery(RINDEX_TABLE_NAME);
}
}