/* 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.uima.fit.testing.factory;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import org.apache.commons.io.IOUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.ComponentTestBase;
import org.apache.uima.fit.type.Sentence;
import org.apache.uima.fit.type.Token;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.pear.util.FileUtil;
import org.junit.Test;

/**
 * Tests for the {@code tokenBuilder} fixture inherited from {@link ComponentTestBase}. Each test
 * builds {@link Token} and {@link Sentence} annotations into a CAS from plain text (optionally
 * with an explicit tokenization, part-of-speech tags and stems) and verifies the resulting
 * annotations.
 */
public class TokenBuilderTest extends ComponentTestBase {

  /**
   * Two sentences with no whitespace between them in the document text. The tokenized string
   * separates them with a line break; the test expects two sentences and 21 tokens whose offsets
   * refer to the original text, each carrying the matching part-of-speech tag and no stem.
   */
  @Test
  public void test1() {
    String text = "What if we built a rocket ship made of cheese?"
            + "We could fly it to the moon for repairs.";
    tokenBuilder
            .buildTokens(
                    jCas,
                    text,
                    "What if we built a rocket ship made of cheese ? \r\n We could fly it to the moon for repairs .",
                    "A B C D E F G H I J K L M N O P Q R S T U");

    FSIndex<Annotation> sentenceIndex = jCas.getAnnotationIndex(Sentence.type);
    assertEquals(2, sentenceIndex.size());
    FSIterator<Annotation> sentences = sentenceIndex.iterator();
    Sentence sentence = (Sentence) sentences.next();
    assertEquals("What if we built a rocket ship made of cheese?", sentence.getCoveredText());
    sentence = (Sentence) sentences.next();
    assertEquals("We could fly it to the moon for repairs.", sentence.getCoveredText());

    FSIndex<Annotation> tokenIndex = jCas.getAnnotationIndex(Token.type);
    assertEquals(21, tokenIndex.size());
    Token token = JCasUtil.selectByIndex(jCas, Token.class, 0);
    testToken(token, "What", 0, 4, "A", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 1);
    testToken(token, "if", 5, 7, "B", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 9);
    testToken(token, "cheese", 39, 45, "J", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 10);
    testToken(token, "?", 45, 46, "K", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 11);
    testToken(token, "We", 46, 48, "L", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 12);
    testToken(token, "could", 49, 54, "M", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 19);
    testToken(token, "repairs", 78, 85, "T", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 20);
    testToken(token, ".", 85, 86, "U", null);
  }

  /**
   * The document text contains " \n" between the sentences. The same text is tokenized twice,
   * once with spaces around the newline in the tokenized string and once without; both runs are
   * expected to yield identical offsets for the tokens on either side of the break.
   */
  @Test
  public void test2() {
    String text = "What if we built a rocket ship made of cheese? \n"
            + "We could fly it to the moon for repairs.";
    tokenBuilder
            .buildTokens(
                    jCas,
                    text,
                    "What if we built a rocket ship made of cheese ? \n We could fly it to the moon for repairs .",
                    "A B C D E F G H I J K L M N O P Q R S T U");

    Token token = JCasUtil.selectByIndex(jCas, Token.class, 10);
    testToken(token, "?", 45, 46, "K", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 11);
    testToken(token, "We", 48, 50, "L", null);

    // Second run on the same document text; only the tokenized string differs.
    jCas.reset();
    tokenBuilder
            .buildTokens(
                    jCas,
                    text,
                    "What if we built a rocket ship made of cheese ?\nWe could fly it to the moon for repairs .",
                    "A B C D E F G H I J K L M N O P Q R S T U");

    token = JCasUtil.selectByIndex(jCas, Token.class, 10);
    testToken(token, "?", 45, 46, "K", null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 11);
    testToken(token, "We", 48, 50, "L", null);
  }

  /**
   * Builds tokens from the text alone (no explicit tokenization). The test expects 13 tokens with
   * punctuation left attached (the last token is "rider."), no part-of-speech tags or stems, and
   * a single sentence covering the whole text.
   */
  @Test
  public void test3() {
    String text = "If you like line writer, then you should really check out line rider.";
    tokenBuilder.buildTokens(jCas, text);

    FSIndex<Annotation> tokenIndex = jCas.getAnnotationIndex(Token.type);
    assertEquals(13, tokenIndex.size());
    Token token = JCasUtil.selectByIndex(jCas, Token.class, 0);
    testToken(token, "If", 0, 2, null, null);
    token = JCasUtil.selectByIndex(jCas, Token.class, 12);
    testToken(token, "rider.", 63, 69, null, null);
    FSIndex<Annotation> sentenceIndex = jCas.getAnnotationIndex(Sentence.type);
    assertEquals(1, sentenceIndex.size());
    Sentence sentence = JCasUtil.selectByIndex(jCas, Sentence.class, 0);
    assertEquals(text, sentence.getCoveredText());
  }

  /**
   * Asserts the covered text, offsets, part-of-speech tag and stem of a single token.
   *
   * @param token
   *          the token to check; the assertion fails if it is {@code null}
   * @param coveredText
   *          expected covered text
   * @param begin
   *          expected begin offset
   * @param end
   *          expected end offset
   * @param partOfSpeech
   *          expected part-of-speech tag, or {@code null} if none is expected
   * @param stem
   *          expected stem, or {@code null} if none is expected
   */
  private void testToken(Token token, String coveredText, int begin, int end, String partOfSpeech,
          String stem) {
    // Fail with a readable message rather than a NullPointerException when the lookup that
    // produced the token found nothing.
    assertNotNull("expected a token covering '" + coveredText + "' but none was found", token);
    assertEquals(coveredText, token.getCoveredText());
    assertEquals(begin, token.getBegin());
    assertEquals(end, token.getEnd());
    assertEquals(partOfSpeech, token.getPos());
    assertEquals(stem, token.getStem());
  }

  /**
   * Reads the complete content of a file and closes the underlying stream afterwards.
   *
   * @param file
   *          the file to read
   * @return the bytes of the file
   * @throws IOException
   *           if the file cannot be opened or read
   */
  private static byte[] readBytes(File file) throws IOException {
    FileInputStream in = new FileInputStream(file);
    try {
      return IOUtils.toByteArray(in);
    } finally {
      in.close();
    }
  }

  /**
   * Asserts that the sentences currently in {@code jCas} have exactly the given covered texts, in
   * iteration order.
   *
   * @param expectedTexts
   *          the expected covered text of each sentence
   */
  private void assertSentences(String... expectedTexts) {
    Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
    assertEquals(expectedTexts.length, sentences.size());
    Iterator<Sentence> iterator = sentences.iterator();
    for (String expectedText : expectedTexts) {
      assertEquals(expectedText, iterator.next().getCoveredText());
    }
  }

  /**
   * Pins down the behavior of {@link String#split(String)} on a string with leading and trailing
   * separators: the leading empty string is kept and the trailing one is dropped, giving two
   * elements.
   */
  @Test
  public void testSpaceSplit() {
    String[] splits = " asdf ".split(" ");
    assertEquals(2, splits.length);
  }

  /**
   * A tokenized string that does not correspond to the document text must be rejected with an
   * {@link IllegalArgumentException}.
   */
  @Test(expected = IllegalArgumentException.class)
  public void testBadInput() {
    String text = "If you like line writer, then you should really check out line rider.";
    tokenBuilder.buildTokens(jCas, text,
            "If you like line rider, then you really don't need line writer");
  }

  /**
   * Supplies tokenization, part-of-speech tags and stems together and checks that the document
   * text is set and that the fifth token carries the matching tag and stem.
   */
  @Test
  public void testStems() {
    String text = "Me and all my friends are non-conformists.";
    tokenBuilder.buildTokens(jCas, text, "Me and all my friends are non - conformists .",
            "M A A M F A N - C .", "me and all my friend are non - conformist .");

    assertEquals("Me and all my friends are non-conformists.", jCas.getDocumentText());
    Token friendToken = JCasUtil.selectByIndex(jCas, Token.class, 4);
    assertEquals("friends", friendToken.getCoveredText());
    assertEquals("F", friendToken.getPos());
    assertEquals("friend", friendToken.getStem());
  }

  /**
   * Tokenization that splits inside whitespace-delimited words (hyphens, commas, single letters):
   * the 17 entries of the tokenized string must produce 17 tokens.
   */
  @Test
  public void test4() {
    String text = "a b-c de--fg h,i,j,k";
    tokenBuilder.buildTokens(jCas, text, "a b - c d e - - f g h , i , j , k");

    // Count by walking the index so the check does not rely on the index's own size bookkeeping.
    FSIterator<Annotation> tokens = jCas.getAnnotationIndex(Token.type).iterator();
    int tokenCount = 0;
    while (tokens.hasNext()) {
      tokenCount++;
      tokens.next();
    }
    assertEquals(17, tokenCount);
  }

  /**
   * Builds tokens in a view other than the one held by {@code jCas} and checks that the tokens
   * can be selected from that view.
   */
  @Test
  public void test5() throws Exception {
    JCas myView = jCas.createView("MyView");

    tokenBuilder.buildTokens(myView, "red and blue cars and tipsy motorcycles");

    Token token = JCasUtil.selectByIndex(myView, Token.class, 6);
    assertEquals("motorcycles", token.getCoveredText());
  }

  /**
   * Loads two files, one with LF and one with CR LF line endings, first verifying the raw bytes
   * around the first line break, and expects both to yield the same four sentences.
   */
  @Test
  public void testNewlinesFromFile() throws Exception {
    File unixNewlines = new File("src/test/resources/data/docs/unix-newlines.txt.bin");
    assertEquals(55, unixNewlines.length());
    byte[] unixNewlinesBytes = readBytes(unixNewlines);
    assertEquals('.', unixNewlinesBytes[13]);
    assertEquals(0x0A, unixNewlinesBytes[14]);
    assertEquals('s', unixNewlinesBytes[15]);

    String text = FileUtil.loadTextFile(unixNewlines, "UTF-8");
    text = text.substring(1); // remove the U+FEFF character from the beginning of the text
    tokenBuilder.buildTokens(jCas, text);

    assertSentences("sentence 1.", "sentence 2.", "sentence 3.", "sentence 4.");

    jCas.reset();
    File windowsNewlines = new File("src/test/resources/data/docs/windows-newlines.txt.bin");
    text = FileUtil.loadTextFile(windowsNewlines, "UTF-8");
    assertEquals(65, windowsNewlines.length());
    byte[] windowsNewlinesBytes = readBytes(windowsNewlines);
    assertEquals('.', windowsNewlinesBytes[13]);
    assertEquals(0x0D, windowsNewlinesBytes[14]);
    assertEquals(0x0A, windowsNewlinesBytes[15]);
    assertEquals('s', windowsNewlinesBytes[16]);
    text = text.substring(1); // remove the U+FEFF character from the beginning of the text
    tokenBuilder.buildTokens(jCas, text);

    assertSentences("sentence 1.", "sentence 2.", "sentence 3.", "sentence 4.");
  }
}
