/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;


import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.automaton.Automaton;

// TODO: maybe also toFST?  then we can translate atts into FST outputs/weights

/** Consumes a TokenStream and creates an {@link Automaton}
 *  where the transition labels are UTF8 bytes (or Unicode 
 *  code points if unicodeArcs is true) from the {@link
 *  TermToBytesRefAttribute}.  Between tokens we insert
 *  POS_SEP and for holes we insert HOLE.
 *
 * @lucene.experimental */
public class TokenStreamToAutomaton {

  private boolean preservePositionIncrements;
  private boolean finalOffsetGapAsHole;
  private boolean unicodeArcs;

  /** Sole constructor. */
  public TokenStreamToAutomaton() {
    this.preservePositionIncrements = true;
  }

  /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
  public void setPreservePositionIncrements(boolean enablePositionIncrements) {
    this.preservePositionIncrements = enablePositionIncrements;
  }

  /** If true, any final offset gaps will result in adding a position hole. */
  public void setFinalOffsetGapAsHole(boolean finalOffsetGapAsHole) {
    this.finalOffsetGapAsHole = finalOffsetGapAsHole;
  }

  /** Whether to make transition labels Unicode code points instead of UTF8 bytes, 
   *  <code>false</code> by default */
  public void setUnicodeArcs(boolean unicodeArcs) {
    this.unicodeArcs = unicodeArcs;
  }

  private static class Position implements RollingBuffer.Resettable {
    // Any tokens that ended at our position arrive to this state:
    int arriving = -1;

    // Any tokens that start at our position leave from this state:
    int leaving = -1;

    @Override
    public void reset() {
      arriving = -1;
      leaving = -1;
    }
  }

  private static class Positions extends RollingBuffer<Position> {
    @Override
    protected Position newInstance() {
      return new Position();
    }
  }

  /** Subclass and implement this if you need to change the
   *  token (such as escaping certain bytes) before it's
   *  turned into a graph. */ 
  protected BytesRef changeToken(BytesRef in) {
    return in;
  }

  /** We create transition between two adjacent tokens. */
  public static final int POS_SEP = 0x001f;

  /** We add this arc to represent a hole. */
  public static final int HOLE = 0x001e;

  /** Pulls the graph (including {@link
   *  PositionLengthAttribute}) from the provided {@link
   *  TokenStream}, and creates the corresponding
   *  automaton where arcs are bytes (or Unicode code points 
   *  if unicodeArcs = true) from each term. */
  public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton.Builder builder = new Automaton.Builder();
    builder.createState();

    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

    in.reset();

    // Only temporarily holds states ahead of our current
    // position:

    final RollingBuffer<Position> positions = new Positions();

    int pos = -1;
    int freedPos = 0;
    Position posData = null;
    int maxOffset = 0;
    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (preservePositionIncrements == false && posInc > 1) {
        posInc = 1;
      }
      assert pos > -1 || posInc > 0;

      if (posInc > 0) {

        // New node:
        pos += posInc;

        posData = positions.get(pos);
        assert posData.leaving == -1;

        if (posData.arriving == -1) {
          // No token ever arrived to this position
          if (pos == 0) {
            // OK: this is the first token
            posData.leaving = 0;
          } else {
            // This means there's a hole (eg, StopFilter
            // does this):
            posData.leaving = builder.createState();
            addHoles(builder, positions, pos);
          }
        } else {
          posData.leaving = builder.createState();
          builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
          if (posInc > 1) {
            // A token spanned over a hole; add holes
            // "under" it:
            addHoles(builder, positions, pos);
          }
        }
        while (freedPos <= pos) {
          Position freePosData = positions.get(freedPos);
          // don't free this position yet if we may still need to fill holes over it:
          if (freePosData.arriving == -1 || freePosData.leaving == -1) {
            break;
          }
          positions.freeBefore(freedPos);
          freedPos++;
        }
      }

      final int endPos = pos + posLengthAtt.getPositionLength();

      final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
      int[] termUnicode = null;
      final Position endPosData = positions.get(endPos);
      if (endPosData.arriving == -1) {
        endPosData.arriving = builder.createState();
      }

      int termLen;
      if (unicodeArcs) {
        final String utf16 = termUTF8.utf8ToString();
        termUnicode = new int[utf16.codePointCount(0, utf16.length())];
        termLen = termUnicode.length;
        for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
          termUnicode[j++] = cp = utf16.codePointAt(i);
        }
      } else {
        termLen = termUTF8.length;
      }

      int state = posData.leaving;

      for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
        final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
        int c;
        if (unicodeArcs) {
          c = termUnicode[byteIDX];
        } else {
          c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
        }
        builder.addTransition(state, nextState, c);
        state = nextState;
      }

      maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
    }

    in.end();

    int endPosInc = posIncAtt.getPositionIncrement();
    if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
      endPosInc = 1;
    } else if (endPosInc > 0 && preservePositionIncrements==false) {
      endPosInc = 0;
    }

    int endState;
    if (endPosInc > 0) {
      // there were hole(s) after the last token
      endState = builder.createState();

      // add trailing holes now:
      int lastState = endState;
      while (true) {
        int state1 = builder.createState();
        builder.addTransition(lastState, state1, HOLE);
        endPosInc--;
        if (endPosInc == 0) {
          builder.setAccept(state1, true);
          break;
        }
        int state2 = builder.createState();
        builder.addTransition(state1, state2, POS_SEP);
        lastState = state2;
      }
    } else {
      endState = -1;
    }

    pos++;
    while (pos <= positions.getMaxPos()) {
      posData = positions.get(pos);
      if (posData.arriving != -1) {
        if (endState != -1) {
          builder.addTransition(posData.arriving, endState, POS_SEP);
        } else {
          builder.setAccept(posData.arriving, true);
        }
      }
      pos++;
    }
    
    return builder.finish();
  }

  // for debugging!
  /*
  private static void toDot(Automaton a) throws IOException {
    final String s = a.toDot();
    Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
    w.write(s);
    w.close();
    System.out.println("TEST: saved to /tmp/out.dot");
  }
  */

  private static void addHoles(Automaton.Builder builder, RollingBuffer<Position> positions, int pos) {
    Position posData = positions.get(pos);
    Position prevPosData = positions.get(pos-1);

    while(posData.arriving == -1 || prevPosData.leaving == -1) {
      if (posData.arriving == -1) {
        posData.arriving = builder.createState();
        builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
      }
      if (prevPosData.leaving == -1) {
        if (pos == 1) {
          prevPosData.leaving = 0;
        } else {
          prevPosData.leaving = builder.createState();
        }
        if (prevPosData.arriving != -1) {
          builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
        }
      }
      builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
      pos--;
      if (pos <= 0) {
        break;
      }
      posData = prevPosData;
      prevPosData = positions.get(pos-1);
    }
  }
}
