solr/solrj/src/java/org/apache/solr/client/solrj/response/AnalysisResponseBase.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.solr.client.solrj.response;

 import org.apache.solr.common.util.NamedList;

 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;

 /**
  * A base class for all analysis responses.
  *
  *
  * @since solr 1.4
  */
 public class AnalysisResponseBase extends SolrResponseBase {

   /**
    * Parses the given named list and builds a list of analysis phases from it. Expects a named list of the form:
    * <br>
    * <pre><code>
    *  &lt;lst name="index"&gt;
    *      &lt;arr name="Tokenizer"&gt;
    *          &lt;str name="text"&gt;the_text&lt;/str&gt;
    *          &lt;str name="rawText"&gt;the_raw_text&lt;/str&gt; (optional)
    *          &lt;str name="type"&gt;the_type&lt;/str&gt;
    *          &lt;int name="start"&gt;1&lt;/int&gt;
    *          &lt;int name="end"&gt;3&lt;/int&gt;
    *          &lt;int name="position"&gt;1&lt;/int&gt;
    *          &lt;bool name="match"&gt;true | false&lt;/bool&gt; (optional)
    *      &lt;/arr&gt;
    *      &lt;arr name="Filter1"&gt;
    *          &lt;str name="text"&gt;the_text&lt;/str&gt;
    *          &lt;str name="rawText"&gt;the_raw_text&lt;/str&gt; (optional)
    *          &lt;str name="type"&gt;the_type&lt;/str&gt;
    *          &lt;int name="start"&gt;1&lt;/int&gt;
    *          &lt;int name="end"&gt;3&lt;/int&gt;
    *          &lt;int name="position"&gt;1&lt;/int&gt;
    *          &lt;bool name="match"&gt;true | false&lt;/bool&gt; (optional)
    *      &lt;/arr&gt;
    *      ...
    *  &lt;/lst&gt;
    * </code></pre>
    *
    * The key of each entry becomes the class name of the resulting phase. The special case is a CharacterFilter
    * that just returns a string, which we then map to a single token without type; any other value is treated as
    * a list of token named lists.
    *
    * @param phaseNL The named list to parse.
    *
    * @return The built analysis phases list, in the iteration order of {@code phaseNL}.
    */
   protected List<AnalysisPhase> buildPhases(NamedList<Object> phaseNL) {
     List<AnalysisPhase> phases = new ArrayList<>(phaseNL.size());
     for (Map.Entry<String, Object> phaseEntry : phaseNL) {
       AnalysisPhase phase = new AnalysisPhase(phaseEntry.getKey());
       Object phaseValue = phaseEntry.getValue();

       if (phaseValue instanceof String) {
         // We are looking at CharacterFilter, which - exceptionally - returns a string
         phase.addTokenInfo(buildTokenInfoFromString((String) phaseValue));
       } else {
         // Every other phase is expected to carry one named list per produced token.
         @SuppressWarnings("unchecked")
         List<NamedList<Object>> tokens = (List<NamedList<Object>>) phaseValue;
         for (NamedList<Object> token : tokens) {
           phase.addTokenInfo(buildTokenInfo(token));
         }
       }
       phases.add(phase);
     }
     return phases;
   }

   /**
    * Converts a string value (from CharacterFilter) into a TokenInfo spanning the full value: the text and the raw
    * text are both the given value, the start is 0, the end is the length of the value, the position is 1 and the
    * match flag is {@code false}.
    *
    * @param value String value
    * @return The built token info (with type set to null)
    */
   protected TokenInfo buildTokenInfoFromString(String value) {
     return new TokenInfo(value, value, null, 0, value.length(), 1, false);
   }

   /**
    * Parses the given named list and builds a token info from it. Expects a named list of the form:
    * <br>
    * <pre><code>
    *  &lt;arr name="Tokenizer"&gt;
    *      &lt;str name="text"&gt;the_text&lt;/str&gt;
    *      &lt;str name="rawText"&gt;the_raw_text&lt;/str&gt; (optional)
    *      &lt;str name="type"&gt;the_type&lt;/str&gt;
    *      &lt;int name="start"&gt;1&lt;/int&gt;
    *      &lt;int name="end"&gt;3&lt;/int&gt;
    *      &lt;int name="position"&gt;1&lt;/int&gt;
    *      &lt;bool name="match"&gt;true | false&lt;/bool&gt; (optional)
    *  &lt;/arr&gt;
    * </code></pre>
    *
    * @param tokenNL The named list to parse.
    *
    * @return The built token info. The match flag is {@code false} when the named list has no {@code match} entry.
    *
    * @throws NullPointerException if the {@code start}, {@code end} or {@code position} entry is absent, since
    *                              these values are unboxed to {@code int}.
    */
   protected TokenInfo buildTokenInfo(NamedList<Object> tokenNL) {
     String text = (String) tokenNL.get("text");
     String rawText = (String) tokenNL.get("rawText");
     String type = (String) tokenNL.get("type");
     int start = (Integer) tokenNL.get("start");
     int end = (Integer) tokenNL.get("end");
     int position = (Integer) tokenNL.get("position");
     // "match" is optional; a missing (null) value counts as "no match".
     Boolean match = (Boolean) tokenNL.get("match");
     return new TokenInfo(text, rawText, type, start, end, position, Boolean.TRUE.equals(match));
   }


   //================================================= Inner Classes ==================================================

   /**
    * A phase in the analysis process. The phase holds the tokens produced in this phase and the name of the class that
    * produced them.
    */
   public static class AnalysisPhase {

     private final String className;
     private final List<TokenInfo> tokens = new ArrayList<>();

     AnalysisPhase(String className) {
       this.className = className;
     }

     /**
      * The name of the class (analyzer, tokenizer, or filter) that produced the token stream for this phase.
      *
      * @return The name of the class that produced the token stream for this phase.
      */
     public String getClassName() {
       return className;
     }

     // Only the enclosing response class adds tokens, while it parses a phase.
     private void addTokenInfo(TokenInfo tokenInfo) {
       tokens.add(tokenInfo);
     }

     /**
      * Returns a list of tokens which represent the token stream produced in this phase. The returned list is the
      * list held by this phase, not a copy.
      *
      * @return A list of tokens which represent the token stream produced in this phase.
      */
     public List<TokenInfo> getTokens() {
       return tokens;
     }

   }

   /**
    * Holds all information of a token as part of an analysis phase.
    */
   public static class TokenInfo {

     private final String text;
     private final String rawText;
     private final String type;
     private final int start;
     private final int end;
     private final int position;
     private final boolean match;

     /**
      * Constructs a new TokenInfo.
      *
      * @param text     The text of the token
      * @param rawText  The raw text of the token. If the token is stored in the index in a special format (e.g.
      *                 dates or padded numbers) this argument should hold this value. If the token is stored as is,
      *                 then this value should be {@code null}.
      * @param type     The type of the token (typically either {@code word} or {@code <ALPHANUM>} though it depends
      *                 on the tokenizer/filter used).
      * @param start    The start position of the token in the original text where it was extracted from.
      * @param end      The end position of the token in the original text where it was extracted from.
      * @param position The position of the token within the token stream.
      * @param match    Indicates whether this token matches one of the query tokens.
      */
     TokenInfo(String text, String rawText, String type, int start, int end, int position, boolean match) {
       this.text = text;
       this.rawText = rawText;
       this.type = type;
       this.start = start;
       this.end = end;
       this.position = position;
       this.match = match;
     }

     /**
      * Returns the text of the token.
      *
      * @return The text of the token.
      */
     public String getText() {
       return text;
     }

     /**
      * Returns the raw text of the token. If the token is indexed in a special format (e.g. date or padded numbers)
      * it will be returned as the raw text. Returns {@code null} if the token is indexed as is.
      *
      * @return Returns the raw text of the token.
      */
     public String getRawText() {
       return rawText;
     }

     /**
      * Returns the type of the token. Typically this will be {@code word} or {@code <ALPHANUM>}, but it really
      * depends on the tokenizer and filters that are used.
      *
      * @return The type of the token.
      */
     public String getType() {
       return type;
     }

     /**
      * Returns the start position of this token within the text it was originally extracted from.
      *
      * @return The start position of this token within the text it was originally extracted from.
      */
     public int getStart() {
       return start;
     }

     /**
      * Returns the end position of this token within the text it was originally extracted from.
      *
      * @return The end position of this token within the text it was originally extracted from.
      */
     public int getEnd() {
       return end;
     }

     /**
      * Returns the position of this token within the produced token stream.
      *
      * @return The position of this token within the produced token stream.
      */
     public int getPosition() {
       return position;
     }

     /**
      * Returns whether this token matches one of the query tokens (if query analysis is performed).
      *
      * @return Whether this token matches one of the query tokens (if query analysis is performed).
      */
     public boolean isMatch() {
       return match;
     }
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.solr.client.solrj.response;

	import org.apache.solr.common.util.NamedList;

	import java.util.ArrayList;
	import java.util.List;
	import java.util.Map;

	/**
	* A base class for all analysis responses.
	*
	*
	* @since solr 1.4
	*/
	public class AnalysisResponseBase extends SolrResponseBase {

	  /**
	   * Parses the given named list and builds a list of analysis phases from it. Expects a named list of the form:
	   * <br>
	   * <pre><code>
	   *  &lt;lst name="index"&gt;
	   *      &lt;arr name="Tokenizer"&gt;
	   *          &lt;str name="text"&gt;the_text&lt;/str&gt;
	   *          &lt;str name="rawText"&gt;the_raw_text&lt;/str&gt; (optional)
	   *          &lt;str name="type"&gt;the_type&lt;/str&gt;
	   *          &lt;int name="start"&gt;1&lt;/int&gt;
	   *          &lt;int name="end"&gt;3&lt;/int&gt;
	   *          &lt;int name="position"&gt;1&lt;/int&gt;
	   *          &lt;bool name="match"&gt;true | false&lt;/bool&gt; (optional)
	   *      &lt;/arr&gt;
	   *      &lt;arr name="Filter1"&gt;
	   *          &lt;str name="text"&gt;the_text&lt;/str&gt;
	   *          &lt;str name="rawText"&gt;the_raw_text&lt;/str&gt; (optional)
	   *          &lt;str name="type"&gt;the_type&lt;/str&gt;
	   *          &lt;int name="start"&gt;1&lt;/int&gt;
	   *          &lt;int name="end"&gt;3&lt;/int&gt;
	   *          &lt;int name="position"&gt;1&lt;/int&gt;
	   *          &lt;bool name="match"&gt;true | false&lt;/bool&gt; (optional)
	   *      &lt;/arr&gt;
	   *      ...
	   *  &lt;/lst&gt;
	   * </code></pre>
	   *
	   * The key of each entry becomes the class name of the resulting phase. The special case is a CharacterFilter
	   * that just returns a string, which we then map to a single token without type; any other value is treated as
	   * a list of token named lists.
	   *
	   * @param phaseNL The named list to parse.
	   *
	   * @return The built analysis phases list, in the iteration order of {@code phaseNL}.
	   */
	  protected List<AnalysisPhase> buildPhases(NamedList<Object> phaseNL) {
	    List<AnalysisPhase> phases = new ArrayList<>(phaseNL.size());
	    for (Map.Entry<String, Object> phaseEntry : phaseNL) {
	      AnalysisPhase phase = new AnalysisPhase(phaseEntry.getKey());
	      Object phaseValue = phaseEntry.getValue();

	      if (phaseValue instanceof String) {
	        // We are looking at CharacterFilter, which - exceptionally - returns a string
	        phase.addTokenInfo(buildTokenInfoFromString((String) phaseValue));
	      } else {
	        // Every other phase is expected to carry one named list per produced token.
	        @SuppressWarnings("unchecked")
	        List<NamedList<Object>> tokens = (List<NamedList<Object>>) phaseValue;
	        for (NamedList<Object> token : tokens) {
	          phase.addTokenInfo(buildTokenInfo(token));
	        }
	      }
	      phases.add(phase);
	    }
	    return phases;
	  }

	  /**
	   * Converts a string value (from CharacterFilter) into a TokenInfo spanning the full value: the text and the raw
	   * text are both the given value, the start is 0, the end is the length of the value, the position is 1 and the
	   * match flag is {@code false}.
	   *
	   * @param value String value
	   * @return The built token info (with type set to null)
	   */
	  protected TokenInfo buildTokenInfoFromString(String value) {
	    return new TokenInfo(value, value, null, 0, value.length(), 1, false);
	  }

	  /**
	   * Parses the given named list and builds a token info from it. Expects a named list of the form:
	   * <br>
	   * <pre><code>
	   *  &lt;arr name="Tokenizer"&gt;
	   *      &lt;str name="text"&gt;the_text&lt;/str&gt;
	   *      &lt;str name="rawText"&gt;the_raw_text&lt;/str&gt; (optional)
	   *      &lt;str name="type"&gt;the_type&lt;/str&gt;
	   *      &lt;int name="start"&gt;1&lt;/int&gt;
	   *      &lt;int name="end"&gt;3&lt;/int&gt;
	   *      &lt;int name="position"&gt;1&lt;/int&gt;
	   *      &lt;bool name="match"&gt;true | false&lt;/bool&gt; (optional)
	   *  &lt;/arr&gt;
	   * </code></pre>
	   *
	   * @param tokenNL The named list to parse.
	   *
	   * @return The built token info. The match flag is {@code false} when the named list has no {@code match} entry.
	   *
	   * @throws NullPointerException if the {@code start}, {@code end} or {@code position} entry is absent, since
	   *                              these values are unboxed to {@code int}.
	   */
	  protected TokenInfo buildTokenInfo(NamedList<Object> tokenNL) {
	    String text = (String) tokenNL.get("text");
	    String rawText = (String) tokenNL.get("rawText");
	    String type = (String) tokenNL.get("type");
	    int start = (Integer) tokenNL.get("start");
	    int end = (Integer) tokenNL.get("end");
	    int position = (Integer) tokenNL.get("position");
	    // "match" is optional; a missing (null) value counts as "no match".
	    Boolean match = (Boolean) tokenNL.get("match");
	    return new TokenInfo(text, rawText, type, start, end, position, Boolean.TRUE.equals(match));
	  }


	  //================================================= Inner Classes ==================================================

	  /**
	   * A phase in the analysis process. The phase holds the tokens produced in this phase and the name of the class that
	   * produced them.
	   */
	  public static class AnalysisPhase {

	    private final String className;
	    private final List<TokenInfo> tokens = new ArrayList<>();

	    AnalysisPhase(String className) {
	      this.className = className;
	    }

	    /**
	     * The name of the class (analyzer, tokenizer, or filter) that produced the token stream for this phase.
	     *
	     * @return The name of the class that produced the token stream for this phase.
	     */
	    public String getClassName() {
	      return className;
	    }

	    // Only the enclosing response class adds tokens, while it parses a phase.
	    private void addTokenInfo(TokenInfo tokenInfo) {
	      tokens.add(tokenInfo);
	    }

	    /**
	     * Returns a list of tokens which represent the token stream produced in this phase. The returned list is the
	     * list held by this phase, not a copy.
	     *
	     * @return A list of tokens which represent the token stream produced in this phase.
	     */
	    public List<TokenInfo> getTokens() {
	      return tokens;
	    }

	  }

	  /**
	   * Holds all information of a token as part of an analysis phase.
	   */
	  public static class TokenInfo {

	    private final String text;
	    private final String rawText;
	    private final String type;
	    private final int start;
	    private final int end;
	    private final int position;
	    private final boolean match;

	    /**
	     * Constructs a new TokenInfo.
	     *
	     * @param text     The text of the token
	     * @param rawText  The raw text of the token. If the token is stored in the index in a special format (e.g.
	     *                 dates or padded numbers) this argument should hold this value. If the token is stored as is,
	     *                 then this value should be {@code null}.
	     * @param type     The type of the token (typically either {@code word} or {@code <ALPHANUM>} though it depends
	     *                 on the tokenizer/filter used).
	     * @param start    The start position of the token in the original text where it was extracted from.
	     * @param end      The end position of the token in the original text where it was extracted from.
	     * @param position The position of the token within the token stream.
	     * @param match    Indicates whether this token matches one of the query tokens.
	     */
	    TokenInfo(String text, String rawText, String type, int start, int end, int position, boolean match) {
	      this.text = text;
	      this.rawText = rawText;
	      this.type = type;
	      this.start = start;
	      this.end = end;
	      this.position = position;
	      this.match = match;
	    }

	    /**
	     * Returns the text of the token.
	     *
	     * @return The text of the token.
	     */
	    public String getText() {
	      return text;
	    }

	    /**
	     * Returns the raw text of the token. If the token is indexed in a special format (e.g. date or padded numbers)
	     * it will be returned as the raw text. Returns {@code null} if the token is indexed as is.
	     *
	     * @return Returns the raw text of the token.
	     */
	    public String getRawText() {
	      return rawText;
	    }

	    /**
	     * Returns the type of the token. Typically this will be {@code word} or {@code <ALPHANUM>}, but it really
	     * depends on the tokenizer and filters that are used.
	     *
	     * @return The type of the token.
	     */
	    public String getType() {
	      return type;
	    }

	    /**
	     * Returns the start position of this token within the text it was originally extracted from.
	     *
	     * @return The start position of this token within the text it was originally extracted from.
	     */
	    public int getStart() {
	      return start;
	    }

	    /**
	     * Returns the end position of this token within the text it was originally extracted from.
	     *
	     * @return The end position of this token within the text it was originally extracted from.
	     */
	    public int getEnd() {
	      return end;
	    }

	    /**
	     * Returns the position of this token within the produced token stream.
	     *
	     * @return The position of this token within the produced token stream.
	     */
	    public int getPosition() {
	      return position;
	    }

	    /**
	     * Returns whether this token matches one of the query tokens (if query analysis is performed).
	     *
	     * @return Whether this token matches one of the query tokens (if query analysis is performed).
	     */
	    public boolean isMatch() {
	      return match;
	    }
	  }

	}