blob: 4d46779ecad018a2ef0c292baa9db558b0359628 [file] [log] [blame]
/*
* Copyright (c) 2013 DataTorrent, Inc. ALL Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datatorrent.demos.rollingtopwords;
import com.datatorrent.api.BaseOperator;
import com.datatorrent.api.DefaultInputPort;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.Context.OperatorContext;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Operator that breaks each incoming text tuple into words and emits every
 * word that is not contained in {@link #filterList}.
 * <p>
 * The text is split on single space characters only. Consecutive spaces
 * therefore produce empty tokens; these are not emitted because the empty
 * string is one of the entries placed in the filter list by
 * {@link #setup(OperatorContext)}. Matching against the filter list is exact
 * and case-sensitive.
 * </p>
 *
 * @since 0.3.2
 */
public class TwitterStatusWordExtractor extends BaseOperator
{
  /**
   * Tokens loaded into {@link #filterList} by {@link #setup(OperatorContext)}:
   * the empty string, punctuation and a fixed list of common words.
   */
  private static final String[] DEFAULT_FILTER_WORDS = {
    "", " ", "I", "you", "the", "a", "to", "as", "he", "him", "his", "her", "she", "me", "can", "for", "of", "and", "or", "but",
    "this", "that", "!", ",", ".", ":", "#", "/", "@", "be", "in", "out", "was", "were", "is", "am", "are", "so", "no", "...", "my", "de", "RT", "on", "que", "la", "i", "your", "it", "have", "with", "?", "when",
    "up", "just", "do", "at", "&", "-", "+", "*", "\\", "y", "n", "like", "se", "en", "te", "el", "I'm"
  };

  /**
   * Words that must not be emitted. Assigned in {@link #setup(OperatorContext)};
   * the input port dereferences it, so it must be non-null before tuples are processed.
   */
  public HashSet<String> filterList;

  /** Port on which every word that passes the filter is emitted. */
  public final transient DefaultOutputPort<String> output = new DefaultOutputPort<String>();

  /** Port receiving the text to be split into words. */
  public final transient DefaultInputPort<String> input = new DefaultInputPort<String>()
  {
    /**
     * Splits the text on single spaces and emits each token that is not in the filter list.
     *
     * @param text the text to tokenize; a {@code null} value is skipped
     */
    @Override
    public void process(String text)
    {
      if (text == null) {
        // Nothing to tokenize; skip the tuple instead of failing with a NullPointerException.
        return;
      }
      // String.split never returns null and never yields null elements,
      // so no further null checks are needed on the tokens.
      for (String word : text.split(" ")) {
        if (!filterList.contains(word)) {
          output.emit(word);
        }
      }
    }
  };

  /**
   * Replaces {@link #filterList} with a fresh set holding the built-in filter words.
   *
   * @param context the operator context; not used by this implementation
   */
  @Override
  public void setup(OperatorContext context)
  {
    this.filterList = new HashSet<String>(Arrays.asList(DEFAULT_FILTER_WORDS));
  }
}