lang/java/avro/src/test/java/org/apache/avro/util/CaseFinder.java - avro - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.avro.util;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.List;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;

 /** Parser for files containing test cases consisting of
  * <code>&lt;String,String&gt;</code> pairs, where the first string is
  * the input to the test case, and the second string is the expected
  * output of the test case.
  *
  * <p> A test-case file is a sequence of <a
  * href="en.wikipedia.org/wiki/Here_document">here documents</a>
  * ("heredocs"), very similar in syntax to Unix Shell heredocs.
  * Heredocs labeled "INPUT" indicate the start of a new case, and
  * these INPUT heredocs the inputs of test cases.  Following an
  * "INPUT" heredoc can more zero or more "expected-output" heredocs.
  * Each of these expected-output heredocs defines what we call a
  * <dfn>subcase</dfn>.  The assumption here is that for each
  * interesting test input, there are often multiple different tests
  * one could run, each with different expected outputs.
  *
  * <p> Consumers of this class call the {@link #find} method to find
  * all subcases marked with a given label.  For example, imagine the
  * following test-case file:
  * <blockquote> <pre>
  *    &lt;&lt;INPUT 0
  *    &lt;&lt;VALUE 0
  *    &lt;&lt;PPRINT 0
  *    &lt;&lt;INPUT 1+1
  *    &lt;&lt;VALUE 2
  *    &lt;&lt;PPRINT 1 + 1
  *    &lt;&lt;SEXP (+ 1 1)
  *    SEXP
  * </pre> </blockquote>
  * Calling {@link #find} on the label "VALUE" will return two test
  * cases, the pair <code>&lt;"0","0"&gt;</code> and
  * <code>&lt;"1+1","2"&gt;</code>.  Calling it on the label "PPRINT"
  * will return <code>&lt;"0","0"&gt;</code> and <code>&lt;"1+1","1 +
  * 1"&gt;</code>.  Notice that there need not be a subcase for every
  * INPUT.  In the case of "SEXP", for example, {@link #find} will
  * return only the single pair <code>&lt;"1+1","(+ 1 1)"&gt;</code>.
  *
  * <p> There are two forms of heredocs, single-line and multi-line.
  * The examples above (except "SEXP") are single-line heredocs.  The
  * general syntax for these is:
  * <blockquote> <pre>
  * ^&lt;&lt;([a-zA-Z][_a-zA-Z0-9]*) (.*)$
  * </pre> </blockquote>
  * The first group in this regex is the label of the heredoc, and the
  * second group is the text of the heredoc.  A single space separates
  * the two groups and is not part of there heredoc (subsequent spaces
  * <em>will</em> be included in the heredoc).  A "line terminator" as
  * defined by the Java language (i.e., CR, LR, or CR followed by LF)
  * terminates a singline-line heredoc but is not included in the text
  * of the heredoc.
  *
  * <p> As the name implies, multi-line heredocs are spread across
  * multiple lines, as in this example:
  * <blockquote> <pre>
  *    &lt;&lt;INPUT
  *    1
  *    +1 +
  *    1
  *    INPUT
  *    &lt;&lt;VALUE 3
  *    &lt;&lt;PPRINT 1 + 1 + 1
  * </pre> </blockquote>
  * In this case, the input to the test case is spread across multiple
  * lines (the line terminators in these documents are preserved as
  * part of the document text).  Multi-line heredocs can be used for
  * both the inputs of text cases and the expected outputs of them.

  * <p> The syntax of multi-line heredocs obey the following pseudo-regex:
  * <blockquote> <pre>
  * ^&lt;&lt;([a-zA-Z][_a-zA-Z0-9]*)$(.*)$^\1$
  * </pre> </blockquote>
  * That is, as illustrated by the example, a multi-line heredoc named
  * "LABEL" consists of the text <code>&lt;lt;LABEL</code> on a line by
  * itself, followed by the text of the heredoc, followed by the text
  * <code>LABEL</code> on a line by itself (if LABEL starts a line but
  * is not the <em>only</em> text on that line, then that entire line
  * is part of the heredoc, and the heredoc is not terminated by that
  * line).
  *
  * <p>In multi-line heredocs, neither the line terminator that
  * terminates the start of the document, nor the one just before the
  * label that ends the heredoc, are part of the text of the heredoc.
  * Thus, for example, the text of the multi-line input from above
  * would be exactly <code>"1\n+1&nbsp;+\n1"</code>.  If you want a new
  * line at the end of a multi-line heredoc, put a blank line before
  * the label ending the heredoc.
  *
  * <p>Also in multi-line heredocs, line-terminators within the heredoc
  * are normalized to line-feeds ('\n').  Thus, for example, when a
  * test file written on a Windows machine is parsed on any machine,
  * the Windows-style line terminators within heredocs will be
  * translated to Unix-style line terminators, no matter what platform
  * the tests are run on.
  *
  * <p> Note that lines between heredocs are ignored, and can be used
  * to provide spacing between and/or commentary on the test cases.
  */
 public class CaseFinder {
   /** Scan test-case file <code>in</code> looking for test subcases
     * marked with <code>caseLabel</code>.  Any such cases are appended
     * (in order) to the "cases" parameter.  If <code>caseLabel</code>
     * equals the string <code>"INPUT"</code>, then returns the list of
     * &lt;<i>input</i>, <code>null</code>&gt; pairs for <i>input</i>
     * equal to all heredoc's named INPUT's found in the input
     * stream. */
   public static List<Object[]> find(BufferedReader in, String label,
                                     List<Object[]> cases)
     throws IOException
   {
     if (! Pattern.matches(LABEL_REGEX, label))
       throw new IllegalArgumentException("Bad case subcase label: " + label);

     final String subcaseMarker = "<<" + label;

     for (String line = in.readLine();;) {
       // Find next new case
       while (line != null && !line.startsWith(NEW_CASE_MARKER))
         line = in.readLine();
       if (line == null) break;
       String input;
       input = processHereDoc(in, line);

       if (label.equals(NEW_CASE_NAME)) {
         cases.add(new Object[] { input, null });
         line = in.readLine();
         continue;
       }

       // Check to see if there's a subcase named "label" for that case
       do {
         line = in.readLine();
       } while (line != null && (!line.startsWith(NEW_CASE_MARKER)
                                 && !line.startsWith(subcaseMarker)));
       if (line == null || line.startsWith(NEW_CASE_MARKER)) continue;
       String expectedOutput = processHereDoc(in, line);

       cases.add(new Object[] { input, expectedOutput });
     }
     in.close();
     return cases;
   }

   private static final String NEW_CASE_NAME = "INPUT";
   private static final String NEW_CASE_MARKER = "<<"+NEW_CASE_NAME;
   private static final String LABEL_REGEX = "[a-zA-Z][_a-zA-Z0-9]*";
   private static final Pattern START_LINE_PATTERN
     = Pattern.compile("^<<("+LABEL_REGEX+")(.*)$");

   /** Reads and returns content of a heredoc.  Assumes we just read a
     * start-of-here-doc marker for a here-doc labeled "docMarker."
     * Replaces arbitrary newlines with sytem newlines, but strips
     * newline from final line of heredoc.  Throws IOException if EOF
     * is reached before heredoc is terminate. */
   private static String processHereDoc(BufferedReader in, String docStart)
     throws IOException
   {
     Matcher m = START_LINE_PATTERN.matcher(docStart);
     if (! m.matches())
       throw new IllegalArgumentException("Wasn't given the start of a heredoc (\""+docStart+"\")");
     String docName = m.group(1);

     // Determine if this is a single-line heredoc, and process if it is
     String singleLineText = m.group(2);
     if (singleLineText.length() != 0) {
       if (! singleLineText.startsWith(" "))
         throw new IOException("Single-line heredoc missing initial space (\""+docStart+"\")");
       return singleLineText.substring(1);
     }

     // Process multi-line heredocs
     StringBuilder result = new StringBuilder();
     String line = in.readLine();
     String prevLine = "";
     boolean firstTime = true;
     while (line != null && !line.equals(docName)) {
       if (! firstTime) result.append(prevLine).append('\n');
       else firstTime = false;
       prevLine = line;
       line = in.readLine();
     }
     if (line == null)
       throw new IOException("Here document (" + docName
                             + ") terminated by end-of-file.");
     return result.append(prevLine).toString();
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.avro.util;

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.util.List;
	import java.util.regex.Pattern;
	import java.util.regex.Matcher;

	/** Parser for files containing test cases consisting of
	* <code><String,String></code> pairs, where the first string is
	* the input to the test case, and the second string is the expected
	* output of the test case.
	*
	* <p> A test-case file is a sequence of <a
	* href="en.wikipedia.org/wiki/Here_document">here documents</a>
	* ("heredocs"), very similar in syntax to Unix Shell heredocs.
	* Heredocs labeled "INPUT" indicate the start of a new case, and
	* these INPUT heredocs the inputs of test cases. Following an
	* "INPUT" heredoc can more zero or more "expected-output" heredocs.
	* Each of these expected-output heredocs defines what we call a
	* <dfn>subcase</dfn>. The assumption here is that for each
	* interesting test input, there are often multiple different tests
	* one could run, each with different expected outputs.
	*
	* <p> Consumers of this class call the {@link #find} method to find
	* all subcases marked with a given label. For example, imagine the
	* following test-case file:
	* <blockquote> <pre>
	* <<INPUT 0
	* <<VALUE 0
	* <<PPRINT 0
	* <<INPUT 1+1
	* <<VALUE 2
	* <<PPRINT 1 + 1
	* <<SEXP (+ 1 1)
	* SEXP
	* </pre> </blockquote>
	* Calling {@link #find} on the label "VALUE" will return two test
	* cases, the pair <code><"0","0"></code> and
	* <code><"1+1","2"></code>. Calling it on the label "PPRINT"
	* will return <code><"0","0"></code> and <code><"1+1","1 +
	* 1"></code>. Notice that there need not be a subcase for every
	* INPUT. In the case of "SEXP", for example, {@link #find} will
	* return only the single pair <code><"1+1","(+ 1 1)"></code>.
	*
	* <p> There are two forms of heredocs, single-line and multi-line.
	* The examples above (except "SEXP") are single-line heredocs. The
	* general syntax for these is:
	* <blockquote> <pre>
	* ^<<([a-zA-Z][_a-zA-Z0-9]) (.)$
	* </pre> </blockquote>
	* The first group in this regex is the label of the heredoc, and the
	* second group is the text of the heredoc. A single space separates
	* the two groups and is not part of there heredoc (subsequent spaces
	* <em>will</em> be included in the heredoc). A "line terminator" as
	* defined by the Java language (i.e., CR, LR, or CR followed by LF)
	* terminates a singline-line heredoc but is not included in the text
	* of the heredoc.
	*
	* <p> As the name implies, multi-line heredocs are spread across
	* multiple lines, as in this example:
	* <blockquote> <pre>
	* <<INPUT
	* 1
	* +1 +
	* 1
	* INPUT
	* <<VALUE 3
	* <<PPRINT 1 + 1 + 1
	* </pre> </blockquote>
	* In this case, the input to the test case is spread across multiple
	* lines (the line terminators in these documents are preserved as
	* part of the document text). Multi-line heredocs can be used for
	* both the inputs of text cases and the expected outputs of them.

	* <p> The syntax of multi-line heredocs obey the following pseudo-regex:
	* <blockquote> <pre>
	* ^<<([a-zA-Z][_a-zA-Z0-9])$(.)$^\1$
	* </pre> </blockquote>
	* That is, as illustrated by the example, a multi-line heredoc named
	* "LABEL" consists of the text <code><lt;LABEL</code> on a line by
	* itself, followed by the text of the heredoc, followed by the text
	* <code>LABEL</code> on a line by itself (if LABEL starts a line but
	* is not the <em>only</em> text on that line, then that entire line
	* is part of the heredoc, and the heredoc is not terminated by that
	* line).
	*
	* <p>In multi-line heredocs, neither the line terminator that
	* terminates the start of the document, nor the one just before the
	* label that ends the heredoc, are part of the text of the heredoc.
	* Thus, for example, the text of the multi-line input from above
	* would be exactly <code>"1\n+1 +\n1"</code>. If you want a new
	* line at the end of a multi-line heredoc, put a blank line before
	* the label ending the heredoc.
	*
	* <p>Also in multi-line heredocs, line-terminators within the heredoc
	* are normalized to line-feeds ('\n'). Thus, for example, when a
	* test file written on a Windows machine is parsed on any machine,
	* the Windows-style line terminators within heredocs will be
	* translated to Unix-style line terminators, no matter what platform
	* the tests are run on.
	*
	* <p> Note that lines between heredocs are ignored, and can be used
	* to provide spacing between and/or commentary on the test cases.
	*/
	public class CaseFinder {
	/** Scan test-case file <code>in</code> looking for test subcases
	* marked with <code>caseLabel</code>. Any such cases are appended
	* (in order) to the "cases" parameter. If <code>caseLabel</code>
	* equals the string <code>"INPUT"</code>, then returns the list of
	* <<i>input</i>, <code>null</code>> pairs for <i>input</i>
	* equal to all heredoc's named INPUT's found in the input
	* stream. */
	public static List<Object[]> find(BufferedReader in, String label,
	List<Object[]> cases)
	throws IOException
	{
	if (! Pattern.matches(LABEL_REGEX, label))
	throw new IllegalArgumentException("Bad case subcase label: " + label);

	final String subcaseMarker = "<<" + label;

	for (String line = in.readLine();;) {
	// Find next new case
	while (line != null && !line.startsWith(NEW_CASE_MARKER))
	line = in.readLine();
	if (line == null) break;
	String input;
	input = processHereDoc(in, line);

	if (label.equals(NEW_CASE_NAME)) {
	cases.add(new Object[] { input, null });
	line = in.readLine();
	continue;
	}

	// Check to see if there's a subcase named "label" for that case
	do {
	line = in.readLine();
	} while (line != null && (!line.startsWith(NEW_CASE_MARKER)
	&& !line.startsWith(subcaseMarker)));
	if (line == null \|\| line.startsWith(NEW_CASE_MARKER)) continue;
	String expectedOutput = processHereDoc(in, line);

	cases.add(new Object[] { input, expectedOutput });
	}
	in.close();
	return cases;
	}

	private static final String NEW_CASE_NAME = "INPUT";
	private static final String NEW_CASE_MARKER = "<<"+NEW_CASE_NAME;
	private static final String LABEL_REGEX = "[a-zA-Z][_a-zA-Z0-9]*";
	private static final Pattern START_LINE_PATTERN
	= Pattern.compile("^<<("+LABEL_REGEX+")(.*)$");

	/** Reads and returns content of a heredoc. Assumes we just read a
	* start-of-here-doc marker for a here-doc labeled "docMarker."
	* Replaces arbitrary newlines with sytem newlines, but strips
	* newline from final line of heredoc. Throws IOException if EOF
	* is reached before heredoc is terminate. */
	private static String processHereDoc(BufferedReader in, String docStart)
	throws IOException
	{
	Matcher m = START_LINE_PATTERN.matcher(docStart);
	if (! m.matches())
	throw new IllegalArgumentException("Wasn't given the start of a heredoc (\""+docStart+"\")");
	String docName = m.group(1);

	// Determine if this is a single-line heredoc, and process if it is
	String singleLineText = m.group(2);
	if (singleLineText.length() != 0) {
	if (! singleLineText.startsWith(" "))
	throw new IOException("Single-line heredoc missing initial space (\""+docStart+"\")");
	return singleLineText.substring(1);
	}

	// Process multi-line heredocs
	StringBuilder result = new StringBuilder();
	String line = in.readLine();
	String prevLine = "";
	boolean firstTime = true;
	while (line != null && !line.equals(docName)) {
	if (! firstTime) result.append(prevLine).append('\n');
	else firstTime = false;
	prevLine = line;
	line = in.readLine();
	}
	if (line == null)
	throw new IOException("Here document (" + docName
	+ ") terminated by end-of-file.");
	return result.append(prevLine).toString();
	}
	}