blob: 475238bbb9b0e154497911ef893838c40939ee26 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
/**
* Segment a sentence of Chinese text into words.
* @lucene.experimental
*/
class WordSegmenter {

  /** HHMM-based segmenter producing the raw token list for a sentence. */
  private final HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();

  /** Filter applied to each token to normalize it before indexing. */
  private final SegTokenFilter tokenFilter = new SegTokenFilter();

  /**
   * Segment a sentence into words with {@link HHMMSegmenter}.
   *
   * @param sentence input sentence
   * @param startOffset start offset of sentence
   * @return {@link List} of {@link SegToken}
   */
  public List<SegToken> segmentSentence(String sentence, int startOffset) {
    List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
    // The segmenter brackets every sentence with WordType.SENTENCE_BEGIN and
    // WordType.SENTENCE_END markers; strip them here. A size of 2 or less
    // means the sentence contained no real tokens.
    List<SegToken> result = Collections.emptyList();
    if (segTokenList.size() > 2) { // if it's not an empty sentence
      result = segTokenList.subList(1, segTokenList.size() - 1);
    }
    // Tokens are mutated in place; the returned list is a view over the
    // segmenter's output with the converted tokens.
    for (SegToken st : result) {
      convertSegToken(st, sentence, startOffset);
    }
    return result;
  }

  /**
   * Process a {@link SegToken} so that it is ready for indexing.
   *
   * This method calculates offsets and normalizes the token with {@link SegTokenFilter}.
   *
   * @param st input {@link SegToken}; mutated in place
   * @param sentence associated Sentence
   * @param sentenceStartOffset offset into sentence
   * @return Lucene {@link SegToken}
   */
  public SegToken convertSegToken(SegToken st, String sentence,
      int sentenceStartOffset) {
    switch (st.wordType) {
      case WordType.STRING:
      case WordType.NUMBER:
      case WordType.FULLWIDTH_NUMBER:
      case WordType.FULLWIDTH_STRING:
        // For these word types, restore the token's char array from the exact
        // substring of the original sentence (presumably the segmenter holds a
        // normalized form for them — NOTE(review): confirm against HHMMSegmenter).
        st.charArray = sentence.substring(st.startOffset, st.endOffset)
            .toCharArray();
        break;
      default:
        break;
    }
    st = tokenFilter.filter(st);
    // Offsets coming out of the segmenter are sentence-relative; shift them
    // so they are relative to the enclosing document/stream.
    st.startOffset += sentenceStartOffset;
    st.endOffset += sentenceStartOffset;
    return st;
  }
}