| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.cn.smart; |
| |
| import java.util.Collections; |
| import java.util.List; |
| |
| import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter; |
| import org.apache.lucene.analysis.cn.smart.hhmm.SegToken; |
| import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; |
| |
| /** |
| * Segment a sentence of Chinese text into words. |
| * @lucene.experimental |
| */ |
| class WordSegmenter { |
| |
| private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter(); |
| |
| private SegTokenFilter tokenFilter = new SegTokenFilter(); |
| |
| /** |
| * Segment a sentence into words with {@link HHMMSegmenter} |
| * |
| * @param sentence input sentence |
| * @param startOffset start offset of sentence |
| * @return {@link List} of {@link SegToken} |
| */ |
| public List<SegToken> segmentSentence(String sentence, int startOffset) { |
| |
| List<SegToken> segTokenList = hhmmSegmenter.process(sentence); |
| // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END |
| List<SegToken> result = Collections.emptyList(); |
| |
| if (segTokenList.size() > 2) // if it's not an empty sentence |
| result = segTokenList.subList(1, segTokenList.size() - 1); |
| |
| for (SegToken st : result) |
| convertSegToken(st, sentence, startOffset); |
| |
| return result; |
| } |
| |
| /** |
| * Process a {@link SegToken} so that it is ready for indexing. |
| * |
| * This method calculates offsets and normalizes the token with {@link SegTokenFilter}. |
| * |
| * @param st input {@link SegToken} |
| * @param sentence associated Sentence |
| * @param sentenceStartOffset offset into sentence |
| * @return Lucene {@link SegToken} |
| */ |
| public SegToken convertSegToken(SegToken st, String sentence, |
| int sentenceStartOffset) { |
| |
| switch (st.wordType) { |
| case WordType.STRING: |
| case WordType.NUMBER: |
| case WordType.FULLWIDTH_NUMBER: |
| case WordType.FULLWIDTH_STRING: |
| st.charArray = sentence.substring(st.startOffset, st.endOffset) |
| .toCharArray(); |
| break; |
| default: |
| break; |
| } |
| |
| st = tokenFilter.filter(st); |
| st.startOffset += sentenceStartOffset; |
| st.endOffset += sentenceStartOffset; |
| return st; |
| } |
| } |