blob: acbc9c37f1bf3f943c594982cb074480c507510e [file] [log] [blame]
package edu.illinois.ncsa.daffodil.processors
/* Copyright (c) 2012-2013 Tresys Technology, LLC. All rights reserved.
*
* Developed by: Tresys Technology, LLC
* http://www.tresys.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal with
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the names of Tresys Technology, nor the names of its contributors
* may be used to endorse or promote products derived from this Software
* without specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
* SOFTWARE.
*/
import java.util.regex.Pattern
import java.util.logging.Logging
import scala.util.control.Breaks
import java.util.regex.Matcher
import scala.collection.mutable.Queue
import edu.illinois.ncsa.daffodil.util.Enum
object DelimiterType extends Enum {
sealed abstract trait Type extends EnumValueType
case object Separator extends Type
case object Terminator extends Type
case object NotDelimited extends Type
}
object DelimiterLocation extends Enum {
sealed abstract trait Type extends EnumValueType
case object Local extends Type
case object Remote extends Type
}
class Delimiter {
var delimiterStr: String = "" // String representation of delimiter Ex. "%WSP;,%WSP*;"
var delimBuf: Array[DelimBase] = Array.empty[DelimBase] /* Buffer where each cell (DelimBase) represents a character
in the delimiter string */
var delimRegExParseDelim: String = "" // Regex to actually parse the entire delimiter
// Pre-compiled RegEx patterns for finding character classes
lazy val NL = Pattern.compile("%(NL);", Pattern.MULTILINE)
lazy val WSP = Pattern.compile("%(WSP);", Pattern.MULTILINE)
lazy val WSP_Plus = Pattern.compile("%(WSP\\+);", Pattern.MULTILINE)
lazy val WSP_Star = Pattern.compile("%(WSP\\*);", Pattern.MULTILINE)
override def toString(): String = {
return "Delimiter[" + delimiterStr + "]"
}
// Must call to create the necessary structures
//
def apply(pDelimiter: String) = {
delimiterStr = pDelimiter
delimBuf = buildDelimBuf(delimiterStr)
delimRegExParseDelim = this.delimRegexParseDelim(delimBuf)
}
// Reduces complicated delimiters containing consecutive WSP, WSP* and WSP+
// character classes.
//
// Ex. %WSP;%WSP*;%NL;%WSP+;%WSP*
// can be reduced to: %WSP+;%NL;%WSP+;
//
// TODO: Maybe should have an error message for the example. What did they mean?
// Problem because NL characters are in WSP. Possible to consume the expected NL
// and thus the rest of the delimiter may not match.
//
// Here we should note that %WSP;%WSP;%WSP; is NOT equivalent to %WSP+;
// as WSP+ would imply that %WSP;%WSP;%WSP;%WSP; is also valid when in fact
// it may not be.
//
def reduceDelimBuf(delims: Array[DelimBase]): Array[DelimBase] = {
val q: Queue[DelimBase] = new Queue[DelimBase]()
// Counters to keep track of WSP,+,* objects
var numWSP: Int = 0
var numWSP_Plus: Int = 0
var numWSP_Star: Int = 0
var idx: Int = 0 // To index the resultant array
delims.foreach(delim => {
delim match {
case wsp: WSPDelim => numWSP += 1
case wsp: WSPPlusDelim => numWSP_Plus += 1
case wsp: WSPStarDelim => numWSP_Star += 1
case _ => {
// We've reached a non WSP delimiter, check if we've
// previously encountered any WSP delimiter objects and
// return the equivalent representation (if any)
val result = getReducedDelim(numWSP, numWSP_Plus, numWSP_Star)
result match {
case Some(x) => {
// WSP exists and an equivalent representation was found
x.index = idx // Set the delimiter's index
q += x
idx += 1
}
case None => {
// Reduction not possible, but did we come across
// more than one WSP?
var i = 0
while (i < numWSP) {
val wsp = new WSPDelim
wsp.index = idx
q += wsp
idx += 1
i += 1
}
}
}
// Set the delimiter's index, needed to
// update the delimBuf individual node (DelimBase) state later
delim.index = idx
q += delim
idx += 1
// Reset counters
numWSP = 0
numWSP_Plus = 0
numWSP_Star = 0
}
}
}) // end-for-each
// Check for leftovers in case the delimiter
// ends in spaces
val result = getReducedDelim(numWSP, numWSP_Plus, numWSP_Star)
result match {
case Some(x) => {
x.index = idx
q += x
}
case None => {
// Reduction not possible, but did we come across
// more than one WSP?
var i = 0
while (i < numWSP) {
val wsp = new WSPDelim
wsp.index = idx
q += wsp
idx += 1
i += 1
}
}
}
q.toArray[DelimBase]
}
// Based upon what WSP delimiters were encountered,
// determine the equivalent representation (if any) and return it.
//
def getReducedDelim(numWSP: Int, numWSP_Plus: Int, numWSP_Star: Int): Option[DelimBase] = {
// TRUTH TABLE
// WSP WSP+ WSP* RESULT
// 1 0 0 0 NONE
// 2 0 0 1 WSP*
// 3 0 1 0 WSP+
// 4 0 1 1 WSP+
// 5 1 0 0 WSP
// 6 1 0 1 WSP+
// 7 1 1 0 WSP+
// 8 1 1 1 WSP+
if (numWSP_Plus != 0) {
// Case: 3, 4, 7, 8
return Some(new WSPPlusDelim())
} else if (numWSP != 0 && numWSP_Plus == 0 && numWSP_Star != 0) { // WSP+ == 0
// Case: 6
return Some(new WSPPlusDelim())
} else if (numWSP == 0 && numWSP_Plus == 0 && numWSP_Star != 0) {
// Case: 2
return Some(new WSPStarDelim())
} else if (numWSP == 1 && numWSP_Plus == 0 && numWSP_Star == 0) {
// Case: 5
return Some(new WSPDelim())
}
None
}
// Creates a RegEx representation of the delimiter.
// Important for comparing the actual delimiter against
// the data returned.
// Ex. separator = "%WSP*;,%WSP*;"
// delimiter retrieved from data: ", "
// There is no way that the separator text can equate to the data
// when character classes are involved, RegEx allows us to determine
// if the delimiter/data was in the expected format.
//
def delimRegexParseDelim(delimiterBuf: Array[DelimBase] = delimBuf): String = {
var sb: StringBuilder = new StringBuilder
delimiterBuf foreach {
delim =>
{
delim match {
case nl: NLDelim => {
sb.append("(?>" + // Eliminates needles backtracking. Atomic group of
"(\\r\\n)|" + // CRLF
"((?<!\\r)\\n)|" + // LF not preceded by CR
"(\\r(?!\\n))|" + // CR not followed by LF
"\\u0085|\\u2028)")
}
case wsp: WSPDelim => {
sb.append("(\\s|\\u0020|\\u0009|\\u000A|\\u000B|\\u000C|\\u000D|\\u0085" +
"|\\u00A0|\\u1680|\\u180E|\\u2000|\\u2001|\\u2002|\\u2003|\\u2004|\\u2005|\\u2006|" +
"\\u2007|\\u2008|\\u2009|\\u200A|\\u2028|\\u2029|\\u202F|\\u205F|\\u3000)")
} // Single space
case wsp: WSPPlusDelim => {
sb.append("(\\s|\\u0020|\\u0009|\\u000A|\\u000B|\\u000C|\\u000D|\\u0085" +
"|\\u00A0|\\u1680|\\u180E|\\u2000|\\u2001|\\u2002|\\u2003|\\u2004|\\u2005|\\u2006|" +
"\\u2007|\\u2008|\\u2009|\\u200A|\\u2028|\\u2029|\\u202F|\\u205F|\\u3000)+")
} // One or more spaces
case wsp: WSPStarDelim => {
sb.append("(\\s|\\u0020|\\u0009|\\u000A|\\u000B|\\u000C|\\u000D|\\u0085" +
"|\\u00A0|\\u1680|\\u180E|\\u2000|\\u2001|\\u2002|\\u2003|\\u2004|\\u2005|\\u2006|" +
"\\u2007|\\u2008|\\u2009|\\u200A|\\u2028|\\u2029|\\u202F|\\u205F|\\u3000)*")
} // None or more spaces
case char: CharDelim => { // Some character
char.char match {
case '[' => sb.append("\\[")
case '\\' => sb.append("\\\\")
case '^' => sb.append("\\^")
case '$' => sb.append("\\$")
case '.' => sb.append("\\.")
case '|' => sb.append("\\|")
case '?' => sb.append("\\?")
case '*' => sb.append("\\*")
case '+' => sb.append("\\+")
case '(' => sb.append("\\(")
case ')' => sb.append("\\)")
case '{' => sb.append("\\{")
case '}' => sb.append("\\}")
case x => sb.append(x)
}
}
}
}
}
sb.toString()
}
// Returns the first character class in the String
// or None if one is not found
//
def findCharClasses(str: String): (Int, Option[DelimBase]) = {
val mNL: Matcher = NL.matcher(str)
val mWSP: Matcher = WSP.matcher(str)
val mWSP_Plus: Matcher = WSP_Plus.matcher(str)
val mWSP_Star: Matcher = WSP_Star.matcher(str)
var length: Int = -1
val classList: scala.collection.mutable.Map[String, (Int, Int)] = scala.collection.mutable.Map.empty
if (mNL.find()) {
classList += ("NL" -> (mNL.start() -> mNL.end()))
}
if (mWSP.find()) {
classList += ("WSP" -> (mWSP.start() -> mWSP.end()))
}
if (mWSP_Plus.find()) {
classList += ("WSP+" -> (mWSP_Plus.start() -> mWSP_Plus.end()))
}
if (mWSP_Star.find()) {
classList += ("WSP*" -> (mWSP_Star.start() -> mWSP_Star.end()))
}
if (classList.size > 0) {
val minItem = classList.minBy(x => x._2._1)
length = minItem._2._2 - minItem._2._1
val result = minItem._1 match {
case "NL" => (length, Some(new NLDelim()))
case "WSP" => (length, Some(new WSPDelim()))
case "WSP+" => (length, Some(new WSPPlusDelim()))
case "WSP*" => (length, Some(new WSPStarDelim()))
}
return result
}
(-1, None) // Unrecognized CharClass
}
// Populates the delimBuf object with an object
// representation of the characters within the delimiter
// string.
//
def buildDelimBuf(delimStr: String): Array[DelimBase] = {
val q: Queue[DelimBase] = new Queue[DelimBase]()
var inc = 0
val loop = new Breaks
var newIdx = 0 // index within delimBuf array
var numCharClass: Int = 0
loop.breakable {
for (i <- 0 until delimStr.length()) {
val idx = i + inc // Advances cursor past the Character Class
if (idx >= delimStr.length()) {
// ran off end of delimiter string, break!
loop.break()
}
val c: Char = delimStr.charAt(idx)
if (c == '%') {
// Possible character class, check patterns
// According to JavaDoc, split will always return at least
// one result even if there is no match.
val split = delimStr.substring(idx + 1).split("%")
val subStr: String = "%" + split(0)
val (matchLength, delimObj) = findCharClasses(subStr)
if (matchLength != -1) {
// Have a match, add the object
val obj = delimObj.get
obj.index = newIdx // Index within delimBuf Array
q += obj
inc += matchLength - 1 // advance cursor past the Character Class
newIdx += 1
numCharClass += 1
} else {
// Not a CharClass or unrecognized,
// therefore treat as a CharDelim
val obj = new CharDelim(c)
obj.index = newIdx // Index within delimBuf Array
newIdx += 1
q += obj
}
} else {
// A CharDelim
val obj = new CharDelim(c)
obj.index = newIdx // Index within delimBuf Array
newIdx += 1
q += obj
}
} // END for-loop
} // END loop-breakable
var resDelimBuf: Array[DelimBase] = null
if (numCharClass > 1) {
// More than one Char Class, reduction possible!
resDelimBuf = reduceDelimBuf(q.toArray[DelimBase])
} else {
// No need to reduce
resDelimBuf = q.toArray[DelimBase]
}
resDelimBuf
}
}
abstract class DelimBase extends Base {
def typeName: String
def print
def printStr: String
override def toString(): String = {
return typeName
}
}
trait Base {
var isMatched: Boolean = false
var index: Int = -1
var charPos: Int = -1
var charPosEnd: Int = -1
def clear = {
isMatched = false
charPos = -1
charPosEnd = -1
}
def checkMatch(charIn: Char): Boolean
}
class CharDelim(val char: Char) extends DelimBase {
def checkMatch(charIn: Char): Boolean = {
val matched = charIn == char
matched
}
lazy val typeName = "CharDelim"
def print = {
//log(LogLevel.Debug, "\t\t\t" + typeName + ": '" + char + "' d" + char.toInt + " isMatched: " + isMatched.toString()))
}
def printStr = {
val res = typeName + "(" + char + ")"
res
}
override def toString(): String = {
return typeName + "[" + char + "]"
}
}
trait CharacterClass {
def convertUnicodeToChar(unicode: String): Char = {
val c: Char = Integer.parseInt(unicode.substring(2), 16).asInstanceOf[Char]
c
}
}
class NLDelim extends DelimBase with CharacterClass {
lazy val typeName = "NLDelim"
lazy val LF: Char = { convertUnicodeToChar("\\u000A") }
lazy val CR: Char = { convertUnicodeToChar("\\u000D") }
lazy val NEL: Char = { convertUnicodeToChar("\\u0085") }
lazy val LS: Char = { convertUnicodeToChar("\\u2028") }
def checkMatch(charIn: Char): Boolean = {
charIn match {
case LF | CR | NEL | LS => isMatched = true
case _ => isMatched = false
}
isMatched
}
def print = {
//log(LogLevel.Debug, "\t\t\t" + typeName + ": NL" + " isMatched: " + isMatched.toString()))
}
def printStr = {
val res = typeName
res
}
}
trait WSP extends CharacterClass {
lazy val CTRL0: Char = { convertUnicodeToChar("\\u0009") }
lazy val CTRL1: Char = { convertUnicodeToChar("\\u000A") }
lazy val CTRL2: Char = { convertUnicodeToChar("\\u000B") }
lazy val CTRL3: Char = { convertUnicodeToChar("\\u000C") }
lazy val CTRL4: Char = { convertUnicodeToChar("\\u000D") }
lazy val SPACE: Char = { convertUnicodeToChar("\\u0020") }
lazy val NEL: Char = { convertUnicodeToChar("\\u0085") }
lazy val NBSP: Char = { convertUnicodeToChar("\\u00A0") }
lazy val OGHAM: Char = { convertUnicodeToChar("\\u1680") }
lazy val MONG: Char = { convertUnicodeToChar("\\u180E") }
lazy val SP0: Char = { convertUnicodeToChar("\\u2000") }
lazy val SP1: Char = { convertUnicodeToChar("\\u2001") }
lazy val SP2: Char = { convertUnicodeToChar("\\u2002") }
lazy val SP3: Char = { convertUnicodeToChar("\\u2003") }
lazy val SP4: Char = { convertUnicodeToChar("\\u2004") }
lazy val SP5: Char = { convertUnicodeToChar("\\u2005") }
lazy val SP6: Char = { convertUnicodeToChar("\\u2006") }
lazy val SP7: Char = { convertUnicodeToChar("\\u2007") }
lazy val SP8: Char = { convertUnicodeToChar("\\u2008") }
lazy val SP9: Char = { convertUnicodeToChar("\\u2009") }
lazy val SP10: Char = { convertUnicodeToChar("\\u200A") }
lazy val LSP: Char = { convertUnicodeToChar("\\u2028") }
lazy val PSP: Char = { convertUnicodeToChar("\\u2029") }
lazy val NARROW: Char = { convertUnicodeToChar("\\u202F") }
lazy val MED: Char = { convertUnicodeToChar("\\u205F") }
lazy val IDE: Char = { convertUnicodeToChar("\\u3000") }
}
class WSPBase extends DelimBase with WSP {
lazy val typeName = "WSPBase"
def checkMatch(charIn: Char): Boolean = {
charIn match {
case CTRL0 | CTRL1 | CTRL2 | CTRL3 | CTRL4 => isMatched = true
case SPACE | NEL | NBSP | OGHAM | MONG => isMatched = true
case SP0 | SP1 | SP2 | SP3 | SP4 | SP5 | SP6 | SP7 | SP8 | SP9 | SP10 => isMatched = true
case LSP | PSP | NARROW | MED | IDE => isMatched = true
case _ => isMatched = false
}
isMatched
}
def print = {
//log(LogLevel.Debug, "\t\t\t" + typeName + ": WSPBase" + " isMatched: " + isMatched.toString()))
}
def printStr = {
val res = typeName
res
}
}
class WSPDelim extends WSPBase with WSP {
override lazy val typeName = "WSPDelim"
override def print = {
//log(LogLevel.Debug, "\t\t\t" + typeName + ": WSP" + " isMatched: " + isMatched.toString()))
}
override def printStr = {
val res = typeName
res
}
}
class WSPPlusDelim extends WSPBase with WSP {
override lazy val typeName = "WSP+Delim"
override def print = {
//log(LogLevel.Debug, "\t\t\t" + typeName + ": WSP+" + " isMatched: " + isMatched.toString()))
}
override def printStr = {
val res = typeName
res
}
}
class WSPStarDelim extends WSPBase with WSP {
override lazy val typeName = "WSP*Delim"
override def print = {
//log(LogLevel.Debug, "\t\t\t" + typeName + ": WSP*" + " isMatched: " + isMatched.toString()))
}
override def printStr = {
val res = typeName
res
}
}