blob: a0e847158e15fcc19ecebed56507f3b812bfa0fb [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch.contrib.text;
import static org.apache.crunch.contrib.text.Extractors.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.util.Collection;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pair;
import org.apache.crunch.Tuple3;
import org.apache.crunch.Tuple4;
import org.apache.crunch.TupleN;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.types.avro.Avros;
import org.junit.Test;
import com.google.common.collect.ImmutableList;
public class ParseTest {
@Test
public void testInt() {
assertEquals(Integer.valueOf(1729), xint().extract("1729"));
assertEquals(Integer.valueOf(321), xint(321).extract("foo"));
}
@Test
public void testString() {
assertEquals("bar", xstring().extract("bar"));
}
@Test
public void testPairWithDrop() {
TokenizerFactory sf = TokenizerFactory.builder().delimiter(",").drop(0, 2).build();
assertEquals(Pair.of(1, "abc"), xpair(sf, xint(), xstring()).extract("foo,1,17.29,abc"));
}
@Test
public void testTripsWithSkip() {
TokenizerFactory sf = TokenizerFactory.builder().delimiter(";").skip("^foo").build();
assertEquals(Tuple3.of(17, "abc", 3.4f),
xtriple(sf, xint(), xstring(), xfloat()).extract("foo17;abc;3.4"));
}
@Test
public void testTripsWithKeep() {
TokenizerFactory sf = TokenizerFactory.builder().delimiter(";").keep(1, 2, 3).build();
assertEquals(Tuple3.of(17, "abc", 3.4f),
xtriple(sf, xint(), xstring(), xfloat()).extract("foo;17;abc;3.4"));
}
@Test
public void testQuadsWithWhitespace() {
TokenizerFactory sf = TokenizerFactory.getDefaultInstance();
assertEquals(Tuple4.of(1.3, "foo", true, 1L),
xquad(sf, xdouble(), xstring(), xboolean(), xlong()).extract("1.3 foo true 1"));
}
@Test
public void testTupleN() {
TokenizerFactory sf = TokenizerFactory.builder().delimiter(",").build();
assertEquals(new TupleN(1, false, true, 2, 3),
xtupleN(sf, xint(), xboolean(), xboolean(), xint(), xint()).extract("1,false,true,2,3"));
}
@Test
public void testCollections() {
TokenizerFactory sf = TokenizerFactory.builder().delimiter(";").build();
// Use 3000 as the default for values we can't parse
Extractor<Collection<Integer>> x = xcollect(sf, xint(3000));
assertEquals(ImmutableList.of(1, 2, 3), x.extract("1;2;3"));
assertFalse(x.errorOnLastRecord());
assertEquals(ImmutableList.of(17, 29, 3000), x.extract("17;29;a"));
assertTrue(x.errorOnLastRecord());
assertEquals(1, x.getStats().getErrorCount());
}
@Test
public void testNestedComposites() {
TokenizerFactory outer = TokenizerFactory.builder().delimiter(";").build();
TokenizerFactory inner = TokenizerFactory.builder().delimiter(",").build();
Extractor<Pair<Pair<Long, Integer>, Tuple3<String, Integer, Float>>> extractor =
xpair(outer, xpair(inner, xlong(), xint()), xtriple(inner, xstring(), xint(), xfloat()));
assertEquals(Pair.of(Pair.of(1L, 2), Tuple3.of("a", 17, 29.0f)),
extractor.extract("1,2;a,17,29"));
}
@Test
public void testParse() {
TokenizerFactory sf = TokenizerFactory.builder().delimiter(",").build();
PCollection<String> lines = MemPipeline.typedCollectionOf(Avros.strings(), "1,3.0");
Iterable<Pair<Integer, Float>> it = Parse.parse("test", lines,
xpair(sf, xint(), xfloat())).materialize();
assertEquals(ImmutableList.of(Pair.of(1, 3.0f)), it);
}
}