compiler/src/main/java/org/apache/royale/compiler/internal/parsing/as/BaseRawASTokenizer.java - royale-compiler - Git at Google

 /*
  *
  *  Licensed to the Apache Software Foundation (ASF) under one or more
  *  contributor license agreements.  See the NOTICE file distributed with
  *  this work for additional information regarding copyright ownership.
  *  The ASF licenses this file to You under the Apache License, Version 2.0
  *  (the "License"); you may not use this file except in compliance with
  *  the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  *  Unless required by applicable law or agreed to in writing, software
  *  distributed under the License is distributed on an "AS IS" BASIS,
  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  *
  */

 package org.apache.royale.compiler.internal.parsing.as;

 import org.apache.royale.compiler.common.ISourceLocation;
 import org.apache.royale.compiler.problems.ASDocNotClosedProblem;
 import org.apache.royale.compiler.problems.CDataNotClosedProblem;
 import org.apache.royale.compiler.problems.CommentNotClosedProblem;
 import org.apache.royale.compiler.problems.ICompilerProblem;
 import org.apache.royale.compiler.problems.StringLiteralMustBeTerminatedBeforeLineBreakProblem;
 import org.apache.royale.compiler.problems.StringLiteralNotClosedProblem;

 /**
  * Base class for RawActionScriptTokenizer. Pulling out code into Java backing
  * class so it's easier to deal with. Uses a token pool to avoid object
  * creation. The token pool will keep a buffer of 10 tokens in memory, and it is
  * generally unsafe to hold on to tokens passed the point they are needed. If
  * tokens are to be preserved, their copy constructor should be used
  */
 public abstract class BaseRawASTokenizer extends BaseRawTokenizer<ASToken>
 {
     /**
      * Tracks if we are in a close tag. Used in the generated code
      */
     protected boolean isInCloseTag = false;

     /**
      * The depth of open tags, used to determine if we're still in E4X content.
      * Each time we see a tag open, this count is incremented. When we see any
      * kind of tag close (</ or />) we decrement the counter.
      */
     protected int e4xTagDepth = 0;

     /**
      * The depth of the braces in E4X content. Used to determine when to pop out
      * of E4X databindings and back to E4X content
      */
     protected int e4xBraceBalance = 0;

     /**
      * State to return to from E4X. Used in the generated code
      */
     protected int e4xReturnState = RawASTokenizer.E4X;

     /**
      * Depth of typed collection constructs. Used in the generated code
      */
     protected int typedDepth = 0;

     /**
      * Nested '<' bracket level (for <!DOCTYPE et al)
      */
     protected int docTypeLevel;

     /**
      * Flag to indicate we should collect comments
      */
     protected boolean collectComments = false;

     /**
      * Token that we may need to return for rules that may return more than one
      * token. {@code null} when nothing is buffered.
      */
     protected ASToken bufferToken;

     /**
      * Continues the current aggregate. When nothing has been aggregated yet
      * this starts a new aggregate instead, so callers do not have to
      * distinguish between the first and subsequent chunks.
      */
     @Override
     protected void continueAggregate()
     {
         if (!hasAggregateContents())
             super.startAggregate();
         else
             super.continueAggregate();
     }

     /**
      * Convert the unicode escape sequence and append the unicode character to
      * text buffer. The input <b>must</b> have the leading escape character "\".
      * If the decoded value is not a valid code point, a bad-character problem
      * is reported for the current lexer text and nothing is appended.
      *
      * @param escapeSequence Unicode escape sequence like {@code \u00FF} or
      * {@code \xFF}.
      */
     protected final void aggregateEscapedUnicodeChar(final String escapeSequence)
     {
         final int unicode = decodeEscapedUnicode(escapeSequence);
         //check to make sure we are in valid unicode range
         if (!Character.isValidCodePoint(unicode))
             addBadCharacterProblem(yytext());
         else
             continueAggregate(Character.toChars(unicode));
     }

     /**
      * Build a token from the aggregation buffer. The token starts at the
      * recorded aggregate start (offset, line and column) and ends at the
      * current marked position. The aggregation buffer is cleared afterwards
      * and the new token is recorded as the last token.
      *
      * @param type Type of the new token.
      * @return New token.
      */
     @Override
     public final ASToken buildAggregateToken(final int type)
     {
         final ASToken token = fetchToken(
                 type,
                 aggregateStart,
                 getOffset() + (markedPosition() - readStart()),
                 aggregateStartLine,
                 aggregateStartColumn,
                 super.aggregateContents);
         aggregateContents = null;
         setLastToken(token);
         return token;
     }

     /**
      * Build an e4x text token from a given entity type.
      * <p>
      * When text has been aggregated before the entity, the entity token is
      * parked in {@link #bufferToken} and the aggregated text is returned
      * first, typed as E4X text or E4X string depending on the current lexer
      * state. Otherwise the entity token itself is returned.
      *
      * @param type Type of the token built for the current match.
      * @return The aggregated text token if there is one, else the token built
      * for the current match.
      */
     protected final ASToken buildE4XTextToken(final int type)
     {
         final ASToken token = buildToken(type);
         if (hasAggregateContents())
         {
             // Two tokens result from this rule: hand out the pending text now
             // and keep the entity token for the next request.
             bufferToken = token;
             final int ttype;
             if (((RawASTokenizer)this).yystate() == RawASTokenizer.E4XTEXTVALUE)
                 ttype = ASTokenTypes.TOKEN_E4X_TEXT;
             else
                 ttype = ASTokenTypes.TOKEN_E4X_STRING;
             return buildAggregateToken(ttype);
         }
         else
         {
             aggregateContents = null;
             return token;
         }
     }

     /**
      * Appends the characters between the read start and the marked position
      * of the active buffer to the given builder.
      *
      * @param builder receives the text of the current read
      */
     @Override
     protected final void fillBuffer(StringBuilder builder)
     {
         builder.append(buffer(), readStart(), markedPosition() - readStart());
     }

     /**
      * returns the start of the current read
      *
      * @return a non-negative int
      */
     protected abstract int readStart();

     /**
      * returns the marked position in the current read
      *
      * @return a non-negative int
      */
     protected abstract int markedPosition();

     /**
      * returns the end of the current read
      *
      * @return a non-negative int
      */
     protected abstract int readEnd();

     /**
      * returns the current active buffer. this will change and is not constant
      *
      * @return the current buffer
      */
     protected abstract char[] buffer();

     /**
      * True if we want to collect comments, besides ASDoc. ASDoc comments will
      * always be returned
      *
      * @param collect <code>true</code> to collect non-ASDoc comments,
      * <code>false</code> to ignore them.
      */
     public void setCollectComments(final boolean collect)
     {
         collectComments = collect;
     }

     /**
      * in E4X, <code>&lt;</code> is allowed in char and string literals.
      *
      * @param allow true if <code>&lt;</code> is allowed
      */
     protected abstract void setAllowLTInE4XStringLiterals(boolean allow);

     /**
      * Creates the backing array for the token pool.
      *
      * @return an empty array with room for 10 tokens
      */
     @Override
     protected ASToken[] initTokenPool()
     {
         return new ASToken[10];
     }

     /**
      * @return true if we are still parsing XML content, i.e. at least one tag
      * is open and we are not inside a databinding brace
      */
     public boolean isInXML()
     {
         return e4xTagDepth > 0 && e4xBraceBalance == 0;
     }

     /**
      * @return true if we are parsing inside of an E4x databinding expression
      */
     public boolean isInE4XDatabinding()
     {
         return e4xBraceBalance > 0;
     }

     /**
      * begins the current state in the lexer
      *
      * @param state the state to begin
      */
     protected abstract void yybegin(int state);

     /**
      * returns a char at the given offset into the internal char buffer
      *
      * @param pos the offset into the buffer
      * @return a char
      */
     protected abstract char yycharat(int pos);

     /**
      * returns the length of the current read
      *
      * @return a non-negative int
      */
     protected abstract int yylength();

     /**
      * Creates a fresh {@link ASToken} from the given values.
      *
      * @param type token type
      * @param start start offset
      * @param end end offset
      * @param line line of the token
      * @param column column of the token
      * @param text token text
      * @return the new token
      */
     @Override
     protected final ASToken newToken(final int type, final int start, final int end, final int line, final int column, final CharSequence text)
     {
         return new ASToken(type, start, end, line, column, text);
     }

     /**
      * @return true if we are collecting comments
      */
     public final boolean isCollectingComments()
     {
         return collectComments;
     }

     /**
      * Hands out a copy of the buffered token and clears the buffer.
      * <p>
      * Callers are expected to check {@link #hasBufferToken()} first: with
      * nothing buffered this method fails with a {@link NullPointerException}.
      *
      * @return a clone of the buffered token
      */
     public final ASToken getBufferToken()
     {
         final ASToken retVal = bufferToken.clone();
         bufferToken = null;
         return retVal;
     }

     /**
      * @return true if a token is waiting in the buffer
      */
     public final boolean hasBufferToken()
     {
         return bufferToken != null;
     }

     /**
      * Resets the tokenizer and clears the E4X tag depth and brace balance.
      */
     @Override
     public void reset()
     {
         super.reset();
         // NOTE(review): only the two E4X counters are cleared here; typedDepth,
         // docTypeLevel, isInCloseTag and bufferToken are left untouched by
         // this method - confirm that is intended.
         e4xBraceBalance = 0;
         e4xTagDepth = 0;
     }

     /**
      * Matches escaped unicode sequence like {@code \u00FF}.
      */
     static final String PATTERN_U4 = "\\\\u[a-fA-F0-9]{4}";

     /**
      * Matches escaped unicode sequence like {@code \xFF}.
      */
     private static final String PATTERN_X2 = "\\\\x[a-fA-F0-9]{2}";

     /**
      * Matches either {@link #PATTERN_U4} or {@link #PATTERN_X2}.
      */
     private static final String PATTERN_UNICODE = String.format("(%s)|(%s)", PATTERN_U4, PATTERN_X2);

     /**
      * {@link #PATTERN_UNICODE} compiled once, so that validating an escape
      * sequence does not recompile the expression on every call. Must be
      * declared after {@link #PATTERN_UNICODE} because static initializers run
      * in textual order.
      */
     private static final java.util.regex.Pattern COMPILED_PATTERN_UNICODE =
             java.util.regex.Pattern.compile(PATTERN_UNICODE);

     /**
      * Convert escaped unicode sequence such as {@code \u00FF} and {@code \xFF}
      * to HTML entities like {@code &#xFF;}.
      *
      * @param escapedUnicode Escaped unicode sequence in the form of either
      * {@code \u0000} or {@code \xFF}.
      * @return Encoded HTML entity string with upper-case hex digits.
      */
     protected String escapedUnicodeToHtmlEntity(final String escapedUnicode)
     {
         final int unicode = decodeEscapedUnicode(escapedUnicode);
         // %X states the intent (upper-case hex) directly; the previous %H
         // produced the same text only because an Integer hashes to its value.
         return String.format("&#x%X;", unicode);
     }

     /**
      * Report unexpected line terminators in a string literal.
      */
     protected final void reportInvalidLineTerminatorInStringLiteral()
     {
         final ISourceLocation location = getCurrentSourceLocation(0);
         final ICompilerProblem problem = new StringLiteralMustBeTerminatedBeforeLineBreakProblem(location);
         getProblems().add(problem);
     }

     /**
      * Report syntax error: input ended before reaching the closing quotation
      * mark for a string literal.
      */
     protected final void reportUnclosedStringLiteral()
     {
         final ISourceLocation location = getCurrentSourceLocation(0);
         final ICompilerProblem problem = new StringLiteralNotClosedProblem(location);
         getProblems().add(problem);
     }

     /**
      * Report syntax error: input ended before ASDoc is closed.
      */
     protected final void reportUnclosedASDoc()
     {
         final ISourceLocation location = getCurrentSourceLocation(0);
         final ICompilerProblem problem = new ASDocNotClosedProblem(location);
         getProblems().add(problem);
     }

     /**
      * Report syntax error: input ended before Comment is closed.
      */
     protected final void reportUnclosedComment()
     {
         final ISourceLocation location = getCurrentSourceLocation(0);
         final ICompilerProblem problem = new CommentNotClosedProblem(location);
         getProblems().add(problem);
     }

     /**
      * Report syntax error: input ended before CDATA is closed.
      */
     protected final void reportUnclosedCDATA()
     {
         final ISourceLocation location = getCurrentSourceLocation(0);
         final ICompilerProblem problem = new CDataNotClosedProblem(location);
         getProblems().add(problem);
     }

     /**
      * Convert escaped unicode sequence such as {@code \u00FF} and {@code \xFF}
      * to unicode code point.
      *
      * @param escapedUnicode Escaped unicode sequence in the form of either
      * {@code \u0000} or {@code \xFF}.
      * @return Unicode number.
      * @throws IllegalArgumentException if {@code escapedUnicode} is null
      * @throws IllegalStateException if {@code escapedUnicode} is not exactly
      * one escape sequence of the supported forms
      */
     protected static int decodeEscapedUnicode(final String escapedUnicode)
     {
         if (escapedUnicode == null)
             throw new IllegalArgumentException("Escape sequence can't be null");

         if (!COMPILED_PATTERN_UNICODE.matcher(escapedUnicode).matches())
             throw new IllegalStateException("Only call this method from a lexer rule that matches unicode sequence pattern.");

         // Skip the two-character prefix (backslash plus 'u' or 'x'); the rest
         // is pure hex as guaranteed by the pattern check above.
         return Integer.parseInt(escapedUnicode.substring(2), 16);
     }

 }
	/*
	*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*/

	package org.apache.royale.compiler.internal.parsing.as;

	import org.apache.royale.compiler.common.ISourceLocation;
	import org.apache.royale.compiler.problems.ASDocNotClosedProblem;
	import org.apache.royale.compiler.problems.CDataNotClosedProblem;
	import org.apache.royale.compiler.problems.CommentNotClosedProblem;
	import org.apache.royale.compiler.problems.ICompilerProblem;
	import org.apache.royale.compiler.problems.StringLiteralMustBeTerminatedBeforeLineBreakProblem;
	import org.apache.royale.compiler.problems.StringLiteralNotClosedProblem;

	/**
	* Base class for RawActionScriptTokenizer. Pulling out code into Java backing
	* class so it's easier to deal with. Uses a token pool to avoid object
	* creation. The token pool will keep a buffer of 10 tokens in memory, and it is
	* generally unsafe to hold on to tokens passed the point they are needed. If
	* tokens are to be preserved, their copy constructor should be used
	*/
	public abstract class BaseRawASTokenizer extends BaseRawTokenizer<ASToken>
	{
	/**
	* Tracks if we are in a close tag. Used in the generated code
	*/
	protected boolean isInCloseTag = false;

	/**
	* The depth of open tags, used to determine if we're still in E4X content.
	* Each time we see a tag open, this count is incremented. When we see any
	* kind of tag close (</ or />) we decrement the counter.
	*/
	protected int e4xTagDepth = 0;

	/**
	* The depth of the braces in E4X content. Used to determine when to pop out
	* of E4X databindings and back to E4X content
	*/
	protected int e4xBraceBalance = 0;

	/**
	* State to return to from E4X. Used in the generated code
	*/
	protected int e4xReturnState = RawASTokenizer.E4X;

	/**
	* Depth of typed collection constructs. Used in the generated code
	*/
	protected int typedDepth = 0;

	/**
	* Nested '<' bracket level (for <!DOCTYPE et al)
	*/
	protected int docTypeLevel;

	/**
	* Flag to indicate we should collect comments
	*/
	protected boolean collectComments = false;

	/**
	* token that we may need to return for rules that may return more than one
	* token
	*/
	protected ASToken bufferToken;

	@Override
	protected void continueAggregate()
	{
	if (!hasAggregateContents())
	super.startAggregate();
	else
	super.continueAggregate();
	}

	/**
	* Convert the unicode escape sequence and append the unicode character to
	* text buffer. The input <b>must</b> have the leading escape character "\".
	*
	* @param escapeSequence Unicode escape sequence like {@code \u00FF} or
	* {@code \xFF}.
	*/
	protected final void aggregateEscapedUnicodeChar(final String escapeSequence)
	{
	final int unicode = decodeEscapedUnicode(escapeSequence);
	//check to make sure we are in valid unicode range
	if (!Character.isValidCodePoint(unicode))
	addBadCharacterProblem(yytext());
	else
	continueAggregate(Character.toChars(unicode));
	}

	/**
	* Build a token from the aggregation buffer.
	*
	* @param type Type of the new token.
	* @return New token.
	*/
	@Override
	public final ASToken buildAggregateToken(final int type)
	{
	final ASToken token = fetchToken(
	type,
	aggregateStart,
	getOffset() + (markedPosition() - readStart()),
	aggregateStartLine,
	aggregateStartColumn,
	super.aggregateContents);
	aggregateContents = null;
	setLastToken(token);
	return token;
	}

	/**
	* Build an e4x text token from a given entity type.
	*/
	protected final ASToken buildE4XTextToken(final int type)
	{
	final ASToken token = buildToken(type);
	if (hasAggregateContents())
	{
	bufferToken = token;
	final int ttype;
	if (((RawASTokenizer)this).yystate() == RawASTokenizer.E4XTEXTVALUE)
	ttype = ASTokenTypes.TOKEN_E4X_TEXT;
	else
	ttype = ASTokenTypes.TOKEN_E4X_STRING;
	return buildAggregateToken(ttype);
	}
	else
	{
	aggregateContents = null;
	return token;
	}
	}

	@Override
	protected final void fillBuffer(StringBuilder builder)
	{
	builder.append(buffer(), readStart(), markedPosition() - readStart());
	}

	/**
	* returns the start of the current read
	*
	* @return a non-negative int
	*/
	protected abstract int readStart();

	/**
	* returns the marked position in the current read
	*
	* @return a non-negative int
	*/
	protected abstract int markedPosition();

	/**
	* returns the end of the current read
	*
	* @return a non-negative int
	*/
	protected abstract int readEnd();

	/**
	* returns the current active buffer. this will change and is not constant
	*
	* @return the current buffer
	*/
	protected abstract char[] buffer();

	/**
	* True if we want to collect comments, besides ASDoc. ASDoc comments will
	* always be returned
	*
	* @param collect <code>true</code to collect non-ASDoc comments,
	* <code>false</code> to ignore them.
	*/
	public void setCollectComments(final boolean collect)
	{
	collectComments = collect;
	}

	/**
	* in E4X, <code><</code> is allowed in char and string literals.
	*
	* @param allow true if <code><</code> is allowed
	*/
	protected abstract void setAllowLTInE4XStringLiterals(boolean allow);

	@Override
	protected ASToken[] initTokenPool()
	{
	return new ASToken[10];
	}

	/**
	* @return true if we are still parsing XML content
	*/
	public boolean isInXML()
	{
	return e4xTagDepth > 0 && e4xBraceBalance == 0;
	}

	/**
	* @return true if we are parsing inside of an E4x databinding expression
	*/
	public boolean isInE4XDatabinding()
	{
	return e4xBraceBalance > 0;
	}

	/**
	* begins the current state in the lexer
	*
	* @param state the state to begin
	*/
	protected abstract void yybegin(int state);

	/**
	* returns a char at the given offset into the internal char buffer
	*
	* @param pos the offset into the buffer
	* @return a char
	*/
	protected abstract char yycharat(int pos);

	/**
	* returns the length of the current read
	*
	* @return a non-negative int
	*/
	protected abstract int yylength();

	@Override
	protected final ASToken newToken(final int type, final int start, final int end, final int line, final int column, final CharSequence text)
	{
	return new ASToken(type, start, end, line, column, text);
	}

	/**
	* @return true if we are collecting comments
	*/
	public final boolean isCollectingComments()
	{
	return collectComments;
	}

	public final ASToken getBufferToken()
	{
	final ASToken retVal = bufferToken.clone();
	bufferToken = null;
	return retVal;
	}

	public final boolean hasBufferToken()
	{
	return bufferToken != null;
	}

	@Override
	public void reset()
	{
	super.reset();
	e4xBraceBalance = 0;
	e4xTagDepth = 0;
	}

	/**
	* Matches escaped unicode sequence like {@code \u00FF}.
	*/
	static final String PATTERN_U4 = "\\\\u[a-fA-F0-9]{4}";

	/**
	* Matches escaped unicode sequence like {@code \xFF}.
	*/
	private static final String PATTERN_X2 = "\\\\x[a-fA-F0-9]{2}";

	/**
	* Matches either {@link #PATTERN_U4} or {@link #PATTERN_X2}.
	*/
	private static final String PATTERN_UNICODE = String.format("(%s)\|(%s)", PATTERN_U4, PATTERN_X2);

	/**
	* Convert escaped unicode sequence such as {@code \u00FF} and {@code \xFF}
	* to HTML entities like {@code &#xFF}.
	*
	* @param escapedUnicode Escaped unicode sequence in the form of either
	* {@code \u0000} or {@code \xFF}.
	* @return Encoded HTML entity string.
	*/
	protected String escapedUnicodeToHtmlEntity(final String escapedUnicode)
	{
	final int unicode = decodeEscapedUnicode(escapedUnicode);
	return String.format("&#x%H;", unicode);
	}

	/**
	* Report unexpected line terminators in a string literal.
	*/
	protected final void reportInvalidLineTerminatorInStringLiteral()
	{
	final ISourceLocation location = getCurrentSourceLocation(0);
	final ICompilerProblem problem = new StringLiteralMustBeTerminatedBeforeLineBreakProblem(location);
	getProblems().add(problem);
	}

	/**
	* Report syntax error: input ended before reaching the closing quotation
	* mark for a string literal.
	*/
	protected final void reportUnclosedStringLiteral()
	{
	final ISourceLocation location = getCurrentSourceLocation(0);
	final ICompilerProblem problem = new StringLiteralNotClosedProblem(location);
	getProblems().add(problem);
	}

	/**
	* Report syntax error: input ended before ASDoc is closed.
	*/
	protected final void reportUnclosedASDoc()
	{
	final ISourceLocation location = getCurrentSourceLocation(0);
	final ICompilerProblem problem = new ASDocNotClosedProblem(location);
	getProblems().add(problem);
	}

	/**
	* Report syntax error: input ended before Comment is closed.
	*/
	protected final void reportUnclosedComment()
	{
	final ISourceLocation location = getCurrentSourceLocation(0);
	final ICompilerProblem problem = new CommentNotClosedProblem(location);
	getProblems().add(problem);
	}

	/**
	* Report syntax error: input ended before CDATA is closed.
	*/
	protected final void reportUnclosedCDATA()
	{
	final ISourceLocation location = getCurrentSourceLocation(0);
	final ICompilerProblem problem = new CDataNotClosedProblem(location);
	getProblems().add(problem);
	}

	/**
	* Convert escaped unicode sequence such as {@code \u00FF} and {@code \xFF}
	* to unicode code point.
	*
	* @param escapedUnicode Escaped unicode sequence in the form of either
	* {@code \u0000} or {@code \xFF}.
	* @return Unicode number.
	*/
	protected static int decodeEscapedUnicode(final String escapedUnicode)
	{
	if (escapedUnicode == null)
	throw new IllegalArgumentException("Escape sequence can't be null");

	if (!escapedUnicode.matches(PATTERN_UNICODE))
	throw new IllegalStateException("Only call this method from a lexer rule that matches unicode sequence pattern.");

	return Integer.parseInt(escapedUnicode.substring(2), 16);
	}

	}