lang/java/avro/src/test/java/org/apache/avro/util/CaseFinder.java - avro - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     https://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.avro.util;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
  * Parser for files containing test cases consisting of
  * <code>&lt;String,String&gt;</code> pairs, where the first string is the input
  * to the test case, and the second string is the expected output of the test
  * case.
  *
  * <p>
  * A test-case file is a sequence of
  * <a href="https://en.wikipedia.org/wiki/Here_document">here documents</a>
  * ("heredocs"), very similar in syntax to Unix Shell heredocs. Heredocs labeled
  * "INPUT" indicate the start of a new case, and these INPUT heredocs are the
  * inputs of test cases. Following an "INPUT" heredoc can come zero or more
  * "expected-output" heredocs. Each of these expected-output heredocs defines
  * what we call a <dfn>subcase</dfn>. The assumption here is that for each
  * interesting test input, there are often multiple different tests one could
  * run, each with different expected outputs.
  *
  * <p>
  * Consumers of this class call the {@link #find} method to find all subcases
  * marked with a given label. For example, imagine the following test-case file:
  * <blockquote>
  *
  * <pre>
  *    &lt;&lt;INPUT 0
  *    &lt;&lt;VALUE 0
  *    &lt;&lt;PPRINT 0
  *    &lt;&lt;INPUT 1+1
  *    &lt;&lt;VALUE 2
  *    &lt;&lt;PPRINT 1 + 1
  *    &lt;&lt;SEXP
  *    (+ 1 1)
  *    SEXP
  * </pre>
  *
  * </blockquote> Calling {@link #find} on the label "VALUE" will return two test
  * cases, the pair <code>&lt;"0","0"&gt;</code> and
  * <code>&lt;"1+1","2"&gt;</code>. Calling it on the label "PPRINT" will return
  * <code>&lt;"0","0"&gt;</code> and <code>&lt;"1+1","1 +
  * 1"&gt;</code>. Notice that there need not be a subcase for every INPUT. In
  * the case of "SEXP", for example, {@link #find} will return only the single
  * pair <code>&lt;"1+1","(+ 1 1)"&gt;</code>.
  *
  * <p>
  * There are two forms of heredocs, single-line and multi-line. The examples
  * above (except "SEXP") are single-line heredocs. The general syntax for these
  * is: <blockquote>
  *
  * <pre>
  * ^&lt;&lt;([a-zA-Z][_a-zA-Z0-9]*) (.*)$
  * </pre>
  *
  * </blockquote> The first group in this regex is the label of the heredoc, and
  * the second group is the text of the heredoc. A single space separates the two
  * groups and is not part of the heredoc (subsequent spaces <em>will</em> be
  * included in the heredoc). A "line terminator" as defined by the Java language
  * (i.e., CR, LF, or CR followed by LF) terminates a single-line heredoc but
  * is not included in the text of the heredoc.
  *
  * <p>
  * As the name implies, multi-line heredocs are spread across multiple lines, as
  * in this example: <blockquote>
  *
  * <pre>
  *    &lt;&lt;INPUT
  *    1
  *    +1 +
  *    1
  *    INPUT
  *    &lt;&lt;VALUE 3
  *    &lt;&lt;PPRINT 1 + 1 + 1
  * </pre>
  *
  * </blockquote> In this case, the input to the test case is spread across
  * multiple lines (the line terminators in these documents are preserved as part
  * of the document text). Multi-line heredocs can be used for both the inputs of
  * test cases and the expected outputs of them.
  *
  * <p>
  * The syntax of multi-line heredocs obeys the following pseudo-regex:
  * <blockquote>
  *
  * <pre>
  * ^&lt;&lt;([a-zA-Z][_a-zA-Z0-9]*)$(.*)$^\1$
  * </pre>
  *
  * </blockquote> That is, as illustrated by the example, a multi-line heredoc
  * named "LABEL" consists of the text <code>&lt;&lt;LABEL</code> on a line by
  * itself, followed by the text of the heredoc, followed by the text
  * <code>LABEL</code> on a line by itself (if LABEL starts a line but is not the
  * <em>only</em> text on that line, then that entire line is part of the
  * heredoc, and the heredoc is not terminated by that line).
  *
  * <p>
  * In multi-line heredocs, neither the line terminator that terminates the start
  * of the document, nor the one just before the label that ends the heredoc, are
  * part of the text of the heredoc. Thus, for example, the text of the
  * multi-line input from above would be exactly <code>"1\n+1&nbsp;+\n1"</code>.
  * If you want a new line at the end of a multi-line heredoc, put a blank line
  * before the label ending the heredoc.
  *
  * <p>
  * Also in multi-line heredocs, line-terminators within the heredoc are
  * normalized to line-feeds ('\n'). Thus, for example, when a test file written
  * on a Windows machine is parsed on any machine, the Windows-style line
  * terminators within heredocs will be translated to Unix-style line
  * terminators, no matter what platform the tests are run on.
  *
  * <p>
  * Note that lines between heredocs are ignored, and can be used to provide
  * spacing between and/or commentary on the test cases.
  */
 public class CaseFinder {
   /** Label of the heredoc that starts a new test case. */
   private static final String NEW_CASE_NAME = "INPUT";
   /** Text that begins the first line of a new-case heredoc. */
   private static final String NEW_CASE_MARKER = "<<" + NEW_CASE_NAME;
   /** Syntax of a heredoc label. */
   private static final String LABEL_REGEX = "[a-zA-Z][_a-zA-Z0-9]*";
   /** First line of any heredoc: group 1 is the label, group 2 is whatever follows it. */
   private static final Pattern START_LINE_PATTERN = Pattern.compile("^<<(" + LABEL_REGEX + ")(.*)$");

   /**
    * Scans test-case file <code>in</code> looking for test subcases marked with
    * <code>label</code>. Any such cases are appended (in order) to the
    * <code>cases</code> parameter as <code>{input, expectedOutput}</code> arrays.
    * For each INPUT heredoc at most one subcase is reported: the first heredoc
    * labeled <code>label</code> that appears before the next INPUT heredoc. If
    * <code>label</code> equals the string <code>"INPUT"</code>, then a
    * &lt;<i>input</i>, <code>null</code>&gt; pair is appended for every heredoc
    * named INPUT found in the input.
    *
    * <p>
    * Labels are compared exactly: a line starting with <code>&lt;&lt;VALUES</code>
    * is not a <code>VALUE</code> subcase, and <code>&lt;&lt;INPUTS</code> does not
    * start a new case.
    *
    * <p>
    * Once the label has been validated, <code>in</code> is always closed before
    * this method returns, whether it returns normally or throws.
    *
    * @param in    reader positioned at the start of the test-case file
    * @param label label of the subcases to collect
    * @param cases list that found cases are appended to
    * @return the <code>cases</code> list that was passed in
    * @throws IllegalArgumentException if <code>label</code> is not a valid label
    * @throws IOException              if reading fails, a single-line heredoc
    *                                  lacks the space after its label, or a
    *                                  multi-line heredoc is not terminated
    */
   public static List<Object[]> find(BufferedReader in, String label, List<Object[]> cases) throws IOException {
     if (!Pattern.matches(LABEL_REGEX, label)) {
       throw new IllegalArgumentException("Bad case subcase label: " + label);
     }

     final String subcaseMarker = "<<" + label;

     // try-with-resources closes the reader even when a malformed heredoc
     // makes processHereDoc throw part-way through the file.
     try (BufferedReader reader = in) {
       String line = reader.readLine();
       while (true) {
         // Find next new case
         while (line != null && !startsHereDoc(line, NEW_CASE_MARKER)) {
           line = reader.readLine();
         }
         if (line == null) {
           break;
         }
         String input = processHereDoc(reader, line);

         if (label.equals(NEW_CASE_NAME)) {
           cases.add(new Object[] { input, null });
           line = reader.readLine();
           continue;
         }

         // Check to see if there's a subcase named "label" for that case.
         // Lines are scanned one at a time, so heredocs with other labels are
         // not parsed here; they are simply skipped over as ordinary lines.
         do {
           line = reader.readLine();
         } while (line != null && !startsHereDoc(line, NEW_CASE_MARKER) && !startsHereDoc(line, subcaseMarker));
         if (line == null || startsHereDoc(line, NEW_CASE_MARKER)) {
           // No such subcase for this input; the outer loop handles EOF or the
           // new case that "line" now holds.
           continue;
         }
         String expectedOutput = processHereDoc(reader, line);

         cases.add(new Object[] { input, expectedOutput });
       }
     }
     return cases;
   }

   /**
    * Tells whether <code>line</code> opens a heredoc whose label is exactly the
    * one contained in <code>marker</code> (which is <code>"&lt;&lt;"</code>
    * followed by the label). The character right after the marker, if any, must
    * not be a label character; otherwise the line carries a longer label that
    * merely shares a prefix with the one requested.
    */
   private static boolean startsHereDoc(String line, String marker) {
     if (!line.startsWith(marker)) {
       return false;
     }
     if (line.length() == marker.length()) {
       return true;
     }
     return !isLabelChar(line.charAt(marker.length()));
   }

   /** Tells whether <code>c</code> may appear after the first character of a label. */
   private static boolean isLabelChar(char c) {
     return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
   }

   /**
    * Reads and returns the content of a heredoc whose first line,
    * <code>docStart</code>, has already been read from <code>in</code>. For a
    * single-line heredoc the text after the separating space is returned and
    * nothing more is read. For a multi-line heredoc, lines are read up to the
    * line that consists solely of the label; they are joined with line-feeds
    * ('\n'), with no line-feed after the last one.
    *
    * @param in       reader positioned just after <code>docStart</code>
    * @param docStart the line that opened the heredoc
    * @return the text of the heredoc
    * @throws IllegalArgumentException if <code>docStart</code> is not the start
    *                                  of a heredoc
    * @throws IOException              if a single-line heredoc lacks the space
    *                                  after its label, or EOF is reached before
    *                                  a multi-line heredoc is terminated
    */
   private static String processHereDoc(BufferedReader in, String docStart) throws IOException {
     Matcher m = START_LINE_PATTERN.matcher(docStart);
     if (!m.matches()) {
       throw new IllegalArgumentException("Wasn't given the start of a heredoc (\"" + docStart + "\")");
     }
     String docName = m.group(1);

     // Determine if this is a single-line heredoc, and process if it is
     String singleLineText = m.group(2);
     if (singleLineText.length() != 0) {
       if (!singleLineText.startsWith(" ")) {
         throw new IOException("Single-line heredoc missing initial space (\"" + docStart + "\")");
       }
       return singleLineText.substring(1);
     }

     // Process multi-line heredocs: join the lines with '\n' separators only,
     // so the terminator before the closing label is not part of the text.
     StringBuilder result = new StringBuilder();
     boolean firstLine = true;
     String line;
     while ((line = in.readLine()) != null && !line.equals(docName)) {
       if (!firstLine) {
         result.append('\n');
       }
       result.append(line);
       firstLine = false;
     }
     if (line == null) {
       throw new IOException("Here document (" + docName + ") terminated by end-of-file.");
     }
     return result.toString();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* https://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.avro.util;

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.util.List;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	/**
	* Parser for files containing test cases consisting of
	* <code>&lt;String,String&gt;</code> pairs, where the first string is the input
	* to the test case, and the second string is the expected output of the test
	* case.
	*
	* <p>
	* A test-case file is a sequence of
	* <a href="https://en.wikipedia.org/wiki/Here_document">here documents</a>
	* ("heredocs"), very similar in syntax to Unix Shell heredocs. Heredocs labeled
	* "INPUT" indicate the start of a new case, and these INPUT heredocs are the
	* inputs of test cases. Following an "INPUT" heredoc can come zero or more
	* "expected-output" heredocs. Each of these expected-output heredocs defines
	* what we call a <dfn>subcase</dfn>. The assumption here is that for each
	* interesting test input, there are often multiple different tests one could
	* run, each with different expected outputs.
	*
	* <p>
	* Consumers of this class call the {@link #find} method to find all subcases
	* marked with a given label. For example, imagine the following test-case file:
	* <blockquote>
	*
	* <pre>
	* &lt;&lt;INPUT 0
	* &lt;&lt;VALUE 0
	* &lt;&lt;PPRINT 0
	* &lt;&lt;INPUT 1+1
	* &lt;&lt;VALUE 2
	* &lt;&lt;PPRINT 1 + 1
	* &lt;&lt;SEXP
	* (+ 1 1)
	* SEXP
	* </pre>
	*
	* </blockquote> Calling {@link #find} on the label "VALUE" will return two test
	* cases, the pair <code>&lt;"0","0"&gt;</code> and
	* <code>&lt;"1+1","2"&gt;</code>. Calling it on the label "PPRINT" will return
	* <code>&lt;"0","0"&gt;</code> and <code>&lt;"1+1","1 +
	* 1"&gt;</code>. Notice that there need not be a subcase for every INPUT. In
	* the case of "SEXP", for example, {@link #find} will return only the single
	* pair <code>&lt;"1+1","(+ 1 1)"&gt;</code>.
	*
	* <p>
	* There are two forms of heredocs, single-line and multi-line. The examples
	* above (except "SEXP") are single-line heredocs. The general syntax for these
	* is: <blockquote>
	*
	* <pre>
	* ^&lt;&lt;([a-zA-Z][_a-zA-Z0-9]*) (.*)$
	* </pre>
	*
	* </blockquote> The first group in this regex is the label of the heredoc, and
	* the second group is the text of the heredoc. A single space separates the two
	* groups and is not part of the heredoc (subsequent spaces <em>will</em> be
	* included in the heredoc). A "line terminator" as defined by the Java language
	* (i.e., CR, LF, or CR followed by LF) terminates a single-line heredoc but
	* is not included in the text of the heredoc.
	*
	* <p>
	* As the name implies, multi-line heredocs are spread across multiple lines, as
	* in this example: <blockquote>
	*
	* <pre>
	* &lt;&lt;INPUT
	* 1
	* +1 +
	* 1
	* INPUT
	* &lt;&lt;VALUE 3
	* &lt;&lt;PPRINT 1 + 1 + 1
	* </pre>
	*
	* </blockquote> In this case, the input to the test case is spread across
	* multiple lines (the line terminators in these documents are preserved as part
	* of the document text). Multi-line heredocs can be used for both the inputs of
	* test cases and the expected outputs of them.
	*
	* <p>
	* The syntax of multi-line heredocs obeys the following pseudo-regex:
	* <blockquote>
	*
	* <pre>
	* ^&lt;&lt;([a-zA-Z][_a-zA-Z0-9]*)$(.*)$^\1$
	* </pre>
	*
	* </blockquote> That is, as illustrated by the example, a multi-line heredoc
	* named "LABEL" consists of the text <code>&lt;&lt;LABEL</code> on a line by
	* itself, followed by the text of the heredoc, followed by the text
	* <code>LABEL</code> on a line by itself (if LABEL starts a line but is not the
	* <em>only</em> text on that line, then that entire line is part of the
	* heredoc, and the heredoc is not terminated by that line).
	*
	* <p>
	* In multi-line heredocs, neither the line terminator that terminates the start
	* of the document, nor the one just before the label that ends the heredoc, are
	* part of the text of the heredoc. Thus, for example, the text of the
	* multi-line input from above would be exactly <code>"1\n+1 +\n1"</code>.
	* If you want a new line at the end of a multi-line heredoc, put a blank line
	* before the label ending the heredoc.
	*
	* <p>
	* Also in multi-line heredocs, line-terminators within the heredoc are
	* normalized to line-feeds ('\n'). Thus, for example, when a test file written
	* on a Windows machine is parsed on any machine, the Windows-style line
	* terminators within heredocs will be translated to Unix-style line
	* terminators, no matter what platform the tests are run on.
	*
	* <p>
	* Note that lines between heredocs are ignored, and can be used to provide
	* spacing between and/or commentary on the test cases.
	*/
	public class CaseFinder {
	  /** Label of the heredoc that starts a new test case. */
	  private static final String NEW_CASE_NAME = "INPUT";
	  /** Text that begins the first line of a new-case heredoc. */
	  private static final String NEW_CASE_MARKER = "<<" + NEW_CASE_NAME;
	  /** Syntax of a heredoc label. */
	  private static final String LABEL_REGEX = "[a-zA-Z][_a-zA-Z0-9]*";
	  /** First line of any heredoc: group 1 is the label, group 2 is whatever follows it. */
	  private static final Pattern START_LINE_PATTERN = Pattern.compile("^<<(" + LABEL_REGEX + ")(.*)$");

	  /**
	   * Scans test-case file <code>in</code> looking for test subcases marked with
	   * <code>label</code>. Any such cases are appended (in order) to the
	   * <code>cases</code> parameter as <code>{input, expectedOutput}</code> arrays.
	   * For each INPUT heredoc at most one subcase is reported: the first heredoc
	   * labeled <code>label</code> that appears before the next INPUT heredoc. If
	   * <code>label</code> equals the string <code>"INPUT"</code>, then a
	   * &lt;<i>input</i>, <code>null</code>&gt; pair is appended for every heredoc
	   * named INPUT found in the input.
	   *
	   * <p>
	   * Labels are compared exactly: a line starting with <code>&lt;&lt;VALUES</code>
	   * is not a <code>VALUE</code> subcase, and <code>&lt;&lt;INPUTS</code> does not
	   * start a new case.
	   *
	   * <p>
	   * Once the label has been validated, <code>in</code> is always closed before
	   * this method returns, whether it returns normally or throws.
	   *
	   * @param in    reader positioned at the start of the test-case file
	   * @param label label of the subcases to collect
	   * @param cases list that found cases are appended to
	   * @return the <code>cases</code> list that was passed in
	   * @throws IllegalArgumentException if <code>label</code> is not a valid label
	   * @throws IOException              if reading fails, a single-line heredoc
	   *                                  lacks the space after its label, or a
	   *                                  multi-line heredoc is not terminated
	   */
	  public static List<Object[]> find(BufferedReader in, String label, List<Object[]> cases) throws IOException {
	    if (!Pattern.matches(LABEL_REGEX, label)) {
	      throw new IllegalArgumentException("Bad case subcase label: " + label);
	    }

	    final String subcaseMarker = "<<" + label;

	    // try-with-resources closes the reader even when a malformed heredoc
	    // makes processHereDoc throw part-way through the file.
	    try (BufferedReader reader = in) {
	      String line = reader.readLine();
	      while (true) {
	        // Find next new case
	        while (line != null && !startsHereDoc(line, NEW_CASE_MARKER)) {
	          line = reader.readLine();
	        }
	        if (line == null) {
	          break;
	        }
	        String input = processHereDoc(reader, line);

	        if (label.equals(NEW_CASE_NAME)) {
	          cases.add(new Object[] { input, null });
	          line = reader.readLine();
	          continue;
	        }

	        // Check to see if there's a subcase named "label" for that case.
	        // Lines are scanned one at a time, so heredocs with other labels are
	        // not parsed here; they are simply skipped over as ordinary lines.
	        do {
	          line = reader.readLine();
	        } while (line != null && !startsHereDoc(line, NEW_CASE_MARKER) && !startsHereDoc(line, subcaseMarker));
	        if (line == null || startsHereDoc(line, NEW_CASE_MARKER)) {
	          // No such subcase for this input; the outer loop handles EOF or the
	          // new case that "line" now holds.
	          continue;
	        }
	        String expectedOutput = processHereDoc(reader, line);

	        cases.add(new Object[] { input, expectedOutput });
	      }
	    }
	    return cases;
	  }

	  /**
	   * Tells whether <code>line</code> opens a heredoc whose label is exactly the
	   * one contained in <code>marker</code> (which is <code>"&lt;&lt;"</code>
	   * followed by the label). The character right after the marker, if any, must
	   * not be a label character; otherwise the line carries a longer label that
	   * merely shares a prefix with the one requested.
	   */
	  private static boolean startsHereDoc(String line, String marker) {
	    if (!line.startsWith(marker)) {
	      return false;
	    }
	    if (line.length() == marker.length()) {
	      return true;
	    }
	    return !isLabelChar(line.charAt(marker.length()));
	  }

	  /** Tells whether <code>c</code> may appear after the first character of a label. */
	  private static boolean isLabelChar(char c) {
	    return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
	  }

	  /**
	   * Reads and returns the content of a heredoc whose first line,
	   * <code>docStart</code>, has already been read from <code>in</code>. For a
	   * single-line heredoc the text after the separating space is returned and
	   * nothing more is read. For a multi-line heredoc, lines are read up to the
	   * line that consists solely of the label; they are joined with line-feeds
	   * ('\n'), with no line-feed after the last one.
	   *
	   * @param in       reader positioned just after <code>docStart</code>
	   * @param docStart the line that opened the heredoc
	   * @return the text of the heredoc
	   * @throws IllegalArgumentException if <code>docStart</code> is not the start
	   *                                  of a heredoc
	   * @throws IOException              if a single-line heredoc lacks the space
	   *                                  after its label, or EOF is reached before
	   *                                  a multi-line heredoc is terminated
	   */
	  private static String processHereDoc(BufferedReader in, String docStart) throws IOException {
	    Matcher m = START_LINE_PATTERN.matcher(docStart);
	    if (!m.matches()) {
	      throw new IllegalArgumentException("Wasn't given the start of a heredoc (\"" + docStart + "\")");
	    }
	    String docName = m.group(1);

	    // Determine if this is a single-line heredoc, and process if it is
	    String singleLineText = m.group(2);
	    if (singleLineText.length() != 0) {
	      if (!singleLineText.startsWith(" ")) {
	        throw new IOException("Single-line heredoc missing initial space (\"" + docStart + "\")");
	      }
	      return singleLineText.substring(1);
	    }

	    // Process multi-line heredocs: join the lines with '\n' separators only,
	    // so the terminator before the closing label is not part of the text.
	    StringBuilder result = new StringBuilder();
	    boolean firstLine = true;
	    String line;
	    while ((line = in.readLine()) != null && !line.equals(docName)) {
	      if (!firstLine) {
	        result.append('\n');
	      }
	      result.append(line);
	      firstLine = false;
	    }
	    if (line == null) {
	      throw new IOException("Here document (" + docName + ") terminated by end-of-file.");
	    }
	    return result.toString();
	  }
	}