| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * https://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nlpcraft.model.impl |
| |
| import java.text.SimpleDateFormat |
| import java.util |
| import java.util.{List => JList} |
| |
| import com.typesafe.scalalogging.LazyLogging |
| import org.apache.nlpcraft.common._ |
| import org.apache.nlpcraft.common.ascii._ |
| import org.apache.nlpcraft.common.nlp._ |
| import org.apache.nlpcraft.model.NCToken |
| import org.apache.nlpcraft.model.impl.NCTokenPimp._ |
| import org.apache.nlpcraft.common.ansi.NCAnsi._ |
| |
| import scala.collection.mutable |
| import scala.jdk.CollectionConverters.{CollectionHasAsScala, MapHasAsScala} |
| |
| /** |
| * Utility service that provides supporting functionality for ASCII rendering. |
| */ |
| //noinspection DuplicatedCode |
| object NCTokenLogger extends LazyLogging { |
/**
 * Rendering metadata for a single note type.
 *
 * @param noteType Note type (e.g. `nlpcraft:nlp`).
 * @param filtered Note value keys that are rendered as separate columns. Empty when `isFull` is `true`.
 * @param isFull `true` when the whole note is rendered into a single column, `false` when only
 *      the `filtered` keys are rendered, one column per key.
 */
case class NoteMetadata(
    noteType: String,
    filtered: Seq[String],
    isFull: Boolean
)
| |
// Display order of the built-in note types in the ASCII output.
// Note types that are not listed here are sorted after all listed ones.
private final val NOTE_TYPES = Seq[String](
    "nlp",
    "continent",
    "subcontinent",
    "country",
    "metro",
    "region",
    "city",
    "date",
    "num",
    "relation",
    "sort",
    "limit",
    "coordinate"
).map(name => s"nlpcraft:$name")
| |
// Per-note-type column filters: only the listed note value keys are displayed, in the listed order.
// Note types without an entry here are displayed in full.
private final val NOTE_COLUMNS = {
    val nlpColumns = Seq(
        "index",
        "origText",
        "lemma",
        "pos",
        "quoted",
        "stopWord",
        "freeword",
        "dict",
        "wordIndexes",
        "direct",
        "sparsity"
    )

    Map[String, Seq[String]]("nlpcraft:nlp" -> nlpColumns)
}
| |
// For each note type: note value key -> position used to order the note's values before rendering.
private final val SORT: Map[String, Map[String, Int]] = {
    val keyOrder = Seq(
        "nlpcraft:continent" -> Seq("continent"),
        "nlpcraft:subcontinent" -> Seq("subcontinent", "continent"),
        "nlpcraft:country" -> Seq("country", "subcontinent", "continent"),
        "nlpcraft:metro" -> Seq("metro"),
        "nlpcraft:region" -> Seq("region", "country", "subcontinent", "continent", "metro"),
        "nlpcraft:city" -> Seq("city", "latitude", "longitude", "region", "country", "subcontinent", "continent"),
        "nlpcraft:date" -> Seq("from", "to", "periods"),
        "nlpcraft:relation" -> Seq("type", "indexes", "note"),
        "nlpcraft:sort" -> Seq("asc", "subjnotes", "subjindexes", "bynotes", "byindexes"),
        "nlpcraft:limit" -> Seq("limit", "indexes", "asc", "note")
    )

    keyOrder.map { case (noteType, keys) => noteType -> keys.zipWithIndex.toMap }.toMap
}
| |
/**
 * Formats the given timestamp as a `yyyy/MM/dd` date string.
 *
 * @param l Timestamp in milliseconds, as accepted by `java.util.Date`.
 */
private def format(l: Long): String = {
    // A fresh formatter per call: `SimpleDateFormat` instances are not shared between calls.
    val fmt = new SimpleDateFormat("yyyy/MM/dd")

    fmt.format(new java.util.Date(l))
}
/**
 * Makes the "greater than" operator string.
 *
 * @param incl Whether the bound is inclusive (`>=`) or exclusive (`>`).
 */
private def mkMore(incl: Boolean): String = s">${if (incl) "=" else ""}"
/**
 * Makes the "less than" operator string.
 *
 * @param incl Whether the bound is inclusive (`<=`) or exclusive (`<`).
 */
private def mkLess(incl: Boolean): String = s"<${if (incl) "=" else ""}"
| |
/**
 * Groups note value keys by note type and orders the groups for display.
 *
 * Note types are ordered by their position in `NOTE_TYPES`; unknown types go last, keeping their
 * first-seen order. A `nlpcraft:` note type with an entry in `NOTE_COLUMNS` is reduced to the keys
 * listed there (in that order); every other note type is marked to be displayed in full.
 *
 * @param pairs Sequence of note key/note value key pairs.
 */
private def filterKeysPairs(pairs: Seq[(String, String)]): Seq[NoteMetadata] = {
    // Position of the note type in the display order, unknown types sort last.
    def rank(noteType: String): Int =
        NOTE_TYPES.indexOf(noteType) match {
            case -1 => Integer.MAX_VALUE
            case idx => idx
        }

    val grouped =
        for (noteType <- pairs.map(_._1).distinct)
            yield noteType -> pairs.collect { case (`noteType`, key) => key }

    grouped.
        sortBy { case (noteType, _) => rank(noteType) }.
        map { case (noteType, keys) =>
            val columns = if (noteType.startsWith("nlpcraft:")) NOTE_COLUMNS.get(noteType) else None

            columns match {
                case Some(cols) =>
                    NoteMetadata(noteType, keys.filter(cols.contains).sortBy(key => cols.indexOf(key)), isFull = false)
                case None =>
                    NoteMetadata(noteType, Seq.empty[String], isFull = true)
            }
        }
}
| |
/**
 * Normalizes a header by dropping the leading `nlpcraft:` prefix, if present.
 *
 * Only the prefix is removed: any later occurrence of `nlpcraft:` inside the header is kept.
 *
 * @param h Header.
 * @return Header without the leading `nlpcraft:` prefix, or `h` itself if it has no such prefix.
 */
private def normalizeHeader(h: String): String = h.stripPrefix("nlpcraft:")
| |
/**
 * Creates an ASCII table whose headers are derived from the given notes metadata.
 *
 * A note displayed in full contributes one header (its normalized type); a filtered note
 * contributes one `type:key` header per filtered key, with the key lower-cased.
 *
 * @param md Notes.
 */
private def mkTable(md: Seq[NoteMetadata]): NCAsciiTable = {
    val headers = md.flatMap { meta =>
        val noteName = normalizeHeader(meta.noteType)

        if (meta.isFull)
            Seq(noteName)
        else
            meta.filtered.map(key => s"$noteName:${key.toLowerCase}")
    }

    NCAsciiTable(headers: _*)
}
| |
/**
 * Renders a single note as a compact string for the ASCII table.
 *
 * Notes with more than one value are rendered by a note-type specific formatter (built-in
 * `nlpcraft:*` types and the `google:`, `stanford:`, `opennlp:` and `spacy:` families); any other
 * multi-value note renders as an empty string. Notes with at most one value render that value as is.
 * When the note spans more than one token, the token range (or the explicit list of token indexes,
 * if they are not contiguous) is appended in angle brackets.
 *
 * @param note Note to render.
 * @return String representation of the note; `<>` when there is nothing to show for a single-token note.
 * @throws IllegalArgumentException If a value required by the note type formatter is missing.
 */
private def note2String(note: NCNlpSentenceNote): String = {
    // Known note types get their values ordered according to SORT, others keep the note's own order.
    val sorted: Seq[(String, java.io.Serializable)] =
        SORT.get(note.noteType) match {
            case Some(order) => note.toSeq.sortBy(p => order.getOrElse(p._1, Int.MaxValue))
            case None => note.toSeq
        }

    def vals2String(seq: Seq[(String, java.io.Serializable)]): String = {
        def getValueOpt(name: String): Option[java.io.Serializable] = seq.find(_._1 == name).map(_._2)

        // Fail-fast in case of programmatic errors (same exception type and message as `require`).
        def getValue(name: String): java.io.Serializable =
            getValueOpt(name).getOrElse(
                throw new IllegalArgumentException(s"requirement failed: Invalid note value: $name")
            )

        def mkValue(name: String, fractionalField: String): String = {
            val d = getValue(name).asInstanceOf[Double]

            // `toLong` rather than `toInt`: whole numbers outside the Int range must not be clamped
            // to Int.MaxValue/Int.MinValue.
            if (getValue(fractionalField).asInstanceOf[Boolean]) d.toString else d.toLong.toString
        }
        def mkBool(name: String): Boolean = getValue(name).asInstanceOf[Boolean]
        def mkBoolOpt(name: String): Option[Boolean] = getValueOpt(name).map(_.asInstanceOf[Boolean])
        def mkString(name: String): String = getValue(name).toString
        def mkStringOpt(name: String): Option[String] = getValueOpt(name).map(_.toString)
        def mkJListString(name: String): String = getValue(name).asInstanceOf[JList[String]].asScala.mkString(",")
        def mkDate(name: String): String = format(getValue(name).asInstanceOf[Long])

        // Truncates to 3 decimal digits. Goes through Long so that values above ~2.1M are not clamped.
        def mkDouble3(name: String): Double = (getValue(name).asInstanceOf[Double] * 1000).toLong / 1000.0
        def indexes2String(v: java.io.Serializable): String = v.asInstanceOf[util.List[Int]].asScala.mkString(",")
        def mkIndexes(name: String): String = indexes2String(getValue(name))

        // Joins the values that are present, silently skipping the missing ones.
        def getSeq(names: String*): String = names.flatMap(name => mkStringOpt(name)).mkString("|")

        note.noteType match {
            case "nlpcraft:continent" => getSeq("continent")
            case "nlpcraft:subcontinent" => getSeq("continent", "subcontinent")
            case "nlpcraft:country" => getSeq("continent", "subcontinent", "country")
            case "nlpcraft:region" => getSeq("continent", "subcontinent", "country", "region")
            case "nlpcraft:city" => getSeq("continent", "subcontinent", "country", "region", "city")
            case "nlpcraft:metro" => getSeq("metro")
            case "nlpcraft:date" =>
                val from = mkDate("from")
                val to = mkDate("to")
                val ps = mkJListString("periods")

                s"range=$from:$to, periods=$ps"

            case "nlpcraft:relation" =>
                val t = mkString("type")
                val refNote = mkString("note")

                s"type=$t, indexes=[${mkIndexes("indexes")}], note=$refNote"

            case "nlpcraft:sort" =>
                val subj = mkStringOpt("subjnotes").
                    map(notes => s"subjnotes=$notes, subjindexes=${mkIndexes("subjindexes")}")
                val by = mkStringOpt("bynotes").
                    map(notes => s"bynotes=$notes, byindexes=${mkIndexes("byindexes")}")
                val asc = mkBoolOpt("asc").map(a => s"asc=$a")

                // Joining only the present parts avoids a dangling leading ", " when
                // neither subject nor "by" notes are set.
                Seq(subj, by, asc).flatten.mkString(", ")

            case "nlpcraft:limit" =>
                val limit = mkDouble3("limit")
                val ascOpt = mkBoolOpt("asc")
                val refNote = mkString("note")

                val base = s"limit=$limit, indexes=[${mkIndexes("indexes")}], note=$refNote"

                ascOpt.fold(base)(asc => s"$base, asc=$asc")

            case "nlpcraft:coordinate" => s"${getValue("latitude")} and ${getValue("longitude")}"

            case "nlpcraft:num" =>
                val from = mkValue("from", "isFractional")
                val to = mkValue("to", "isFractional")
                val fromIncl = mkBool("fromIncl")
                val toIncl = mkBool("toIncl")
                val isRangeCond = mkBool("isRangeCondition")
                val isEqCond = mkBool("isEqualCondition")
                val isNotEqCond = mkBool("isNotEqualCondition")
                val isFromNegInf = mkBool("isFromNegativeInfinity")
                val isToPosInf = mkBool("isToPositiveInfinity")

                val x1 = if (isFromNegInf) "-Infinity" else from
                val x2 = if (isToPosInf) "+Infinity" else to

                val cond =
                    if (isRangeCond)
                        s"${mkMore(fromIncl)}$x1 && ${mkLess(toIncl)}$x2"
                    else if (isEqCond)
                        s"=$x1"
                    else {
                        assert(isNotEqCond)

                        s"!=$x1"
                    }

                getValueOpt("unit") match {
                    case Some(u) => s"$cond, unit=$u(${getValue("unitType")})"
                    case None => cond
                }

            case name if name.startsWith("google:") =>
                val meta =
                    getValue("meta").
                        asInstanceOf[java.util.Map[String, java.io.Serializable]].
                        asScala.map(p => s"${p._1}=${p._2}").mkString(",")

                // Mentions.
                val beginOffsets = getValue("mentionsBeginOffsets").asInstanceOf[JList[Int]]
                val contents = getValue("mentionsContents").asInstanceOf[JList[String]]
                val types = getValue("mentionsTypes").asInstanceOf[JList[String]]

                require(beginOffsets.size() == contents.size())
                require(types.size() == contents.size())

                val mentions =
                    beginOffsets.asScala.zip(contents.asScala).zip(types.asScala).
                        map { case ((o, c), t) => s"beginOffset=$o, content=$c, type=$t" }.mkString(", ")

                val sal = mkDouble3("salience")

                s"meta=[$meta], mentions=[$mentions], salience=$sal"

            case name if name.startsWith("stanford:") =>
                val conf = s"confidence=${mkDouble3("confidence")}"

                mkStringOpt("nne").fold(conf)(nne => s"$conf, nne=$nne")

            case name if name.startsWith("opennlp:") =>
                s"probability=${mkDouble3("probability")}"

            case name if name.startsWith("spacy:") =>
                val base = s"vector=${mkDouble3("vector")}, sentiment=${mkDouble3("sentiment")}"

                // Optional metadata is shown only when it is present and non-empty.
                val metaOpt =
                    getValueOpt("meta").
                        map(_.asInstanceOf[java.util.Map[String, String]].asScala).
                        filter(_.nonEmpty)

                metaOpt match {
                    case Some(metaMap) =>
                        val v = metaMap.map(p => s"${p._1}=${p._2}").mkString(",")

                        s"$base, meta=$v"
                    case None => base
                }

            // User tokens.
            case _ => ""
        }
    }

    val v = if (sorted.lengthCompare(1) > 0) vals2String(sorted) else sorted.map(p => s"${p._2}").mkString(", ")

    if (note.tokenFrom < note.tokenTo) {
        val idxs = note.tokenIndexes

        // Contiguous token indexes are shown as a range, otherwise every index is listed.
        val contiguous = idxs.tail.zipWithIndex.forall { case (idx, i) => idx == idxs(i) + 1 }

        if (contiguous)
            s"$v <${note.tokenFrom} to ${note.tokenTo}>"
        else
            s"$v <${idxs.mkString(",")}>"
    }
    else if (v.isEmpty)
        "<>"
    else
        v
}
| |
/**
 * Makes the table cells for one sentence token, aligned with the headers produced from `hs`.
 *
 * For a note type displayed in full there is a single cell holding all of the token's notes of that
 * type, each rendered by `note2String`. For a filtered note type there is one cell per filtered key,
 * holding the values of that key from the token's notes of that type.
 *
 * @param hs Notes metadata, in column order.
 * @param t Sentence token.
 */
private def mkCells(hs: Seq[NoteMetadata], t: NCNlpSentenceToken): Seq[String] =
    hs.flatMap { meta =>
        // Token's notes of the type described by this metadata.
        def notes: Iterable[NCNlpSentenceNote] = t.filter(_.noteType == meta.noteType)

        if (meta.isFull)
            Seq(notes.map(n => note2String(n)).mkString(", "))
        else
            meta.filtered.map { key =>
                notes.filter(_.contains(key)).map(n => n(key)).mkString(", ")
            }
    }
| |
/**
 * Prepares table to print for the given NLP sentence.
 *
 * Columns are derived from all distinct (note type, note value key) pairs found in the sentence;
 * one row is added per sentence token.
 *
 * @param sen NLP sentence.
 */
def prepareTable(sen: NCNlpSentence): NCAsciiTable = {
    // All distinct (note type -> note value key) pairs across all tokens of the sentence.
    val keyPairs =
        sen.flatMap(tok => tok.map(note => note.keys.map(key => note.noteType -> key))).
            flatten.
            toSeq.
            distinct

    val md = filterKeysPairs(keyPairs)
    val tbl = mkTable(md)

    sen.foreach(tok => tbl += (mkCells(md, tok): _*))

    tbl
}
| |
/**
 * Prepares table to print for the given model tokens.
 *
 * One row is added per token with its basic NLP properties. Unless every token is a free word,
 * an extra `token data` column is added with a token ID specific summary of the token's metadata
 * (built-in `nlpcraft:*` tokens, `google:`, `opennlp:`, `stanford:` and `spacy:` tokens), or, for
 * user-defined tokens, the token ID with its aliases and part tokens. In that mode rows of
 * non-`nlpcraft:nlp` tokens are colorized.
 *
 * @param toks Tokens to render.
 * @return ASCII table with one row per token.
 */
def prepareTable(toks: Seq[NCToken]): NCAsciiTable = {
    val allFree = toks.forall(_.isFreeWord)

    val headers = mutable.ArrayBuffer.empty[String] ++
        Seq(
            "idx",
            "origtext",
            "lemma",
            "pos",
            "quoted",
            "stopword",
            "freeword",
            "wordindexes",
            "direct",
            "sparsity"
        )

    if (!allFree)
        headers += "token data"

    val tbl = NCAsciiTable(headers)

    toks.foreach(tok => {
        val md = tok.getMetadata
        val id = tok.getId

        // Metadata keys are stored as `<token id>:<name>`.
        def mkFullName(name: String): String = s"$id:$name"
        def get[T](name: String): T = md.get(mkFullName(name)).asInstanceOf[T]
        def getOpt[T](name: String): Option[T] = Option(md.get(mkFullName(name))).map(_.asInstanceOf[T])
        def has(name: String): Boolean = md.containsKey(mkFullName(name))

        // NOTE: `get`/`getOpt` are always called with an explicit type argument below so that `T`
        // is never left to be inferred as `Nothing` in untyped contexts (e.g. string interpolation).

        // Joins the metadata values that are present, silently skipping the missing ones.
        def mkString(names: String*): String = names.flatMap(name => getOpt[Any](name)).mkString("|")

        def getIndexes(name: String): String = get[JList[AnyRef]](name).asScala.mkString(", ")

        // Truncates to 3 decimal digits. Goes through Long so that values above ~2.1M are not clamped.
        def mkDouble3(name: String): Double = (get[Double](name) * 1000).toLong / 1000.0

        val origTxtStr =
            if (tok.isStopWord)
                r(tok.origText)
            else if (tok.isFreeWord)
                y(tok.origText)
            else
                tok.origText

        val row =
            Seq(
                tok.index,
                origTxtStr,
                tok.lemma,
                tok.pos,
                tok.isQuoted,
                if (tok.isStopWord) s"${r("true")}" else "false",
                if (tok.isFreeWord) s"${y("true")}" else "false",
                s"[${tok.wordIndexes.mkString(",")}]",
                tok.isDirect,
                tok.sparsity
            )

        if (allFree)
            tbl += (row :_*)
        else {
            val v =
                id match {
                    case "nlpcraft:nlp" => ""

                    case "nlpcraft:continent" => mkString("continent")
                    case "nlpcraft:subcontinent" => mkString("continent", "subcontinent")
                    case "nlpcraft:country" => mkString("continent", "subcontinent", "country")
                    case "nlpcraft:region" => mkString("continent", "subcontinent", "country", "region")
                    case "nlpcraft:city" => mkString("continent", "subcontinent", "country", "region", "city")
                    case "nlpcraft:metro" => mkString("metro")
                    case "nlpcraft:date" =>
                        val from = format(get[Long]("from"))
                        val to = format(get[Long]("to"))
                        val ps = get[JList[String]]("periods")

                        s"range=$from:$to, periods=${ps.asScala.mkString(",")}"

                    case "nlpcraft:relation" =>
                        val t = mkString("type")
                        val note = mkString("note")

                        s"type=$t, indexes=[${getIndexes("indexes")}], note=$note"

                    case "nlpcraft:sort" =>
                        // Renders `<notes>=..., <indexes>=[...]` if the notes list is present.
                        def getList(notesName: String, indexesName: String): Option[String] =
                            getOpt[JList[String]](notesName).map(notes =>
                                s"$notesName=${notes.asScala.mkString(", ")}, $indexesName=[${getIndexes(indexesName)}]"
                            )

                        val parts = Seq(getList("subjnotes", "subjindexes"), getList("bynotes", "byindexes")).flatten

                        // At least one of subject or "by" notes must be present.
                        require(parts.nonEmpty)

                        val s = parts.mkString(", ")

                        if (has("asc")) s"$s, asc=${get[Any]("asc")}" else s

                    case "nlpcraft:limit" =>
                        val limit = mkDouble3("limit")
                        val note = mkString("note")

                        val s = s"limit=$limit, indexes=[${getIndexes("indexes")}], note=$note"

                        if (has("asc")) s"$s, asc=${get[Any]("asc")}" else s

                    case "nlpcraft:num" =>
                        def mkValue(name: String, fractionalField: String): String = {
                            val d = get[Double](name)
                            val fr = get[Boolean](fractionalField)

                            // `toLong` rather than `toInt`: whole numbers outside the Int range
                            // must not be clamped to Int.MaxValue/Int.MinValue.
                            if (fr) d.toString else d.toLong.toString
                        }

                        val from = mkValue("from", "isfractional")
                        val to = mkValue("to", "isfractional")
                        val fromIncl = get[Boolean]("fromincl")
                        val toIncl = get[Boolean]("toincl")
                        val isRangeCond = get[Boolean]("israngecondition")
                        val isEqCond = get[Boolean]("isequalcondition")
                        val isNotEqCond = get[Boolean]("isnotequalcondition")
                        val isFromNegInf = get[Boolean]("isfromnegativeinfinity")
                        val isToPosInf = get[Boolean]("istopositiveinfinity")

                        val x1 = if (isFromNegInf) "-Infinity" else from
                        val x2 = if (isToPosInf) "+Infinity" else to

                        val cond =
                            if (isRangeCond)
                                s"${mkMore(fromIncl)}$x1 && ${mkLess(toIncl)}$x2"
                            else if (isEqCond)
                                s"=$x1"
                            else {
                                assert(isNotEqCond)

                                s"!=$x1"
                            }

                        if (has("unit")) {
                            val unit = mkString("unit")
                            val unitType = mkString("unittype")

                            s"$cond, unit=$unit($unitType)"
                        }
                        else
                            cond

                    case "nlpcraft:coordinate" => mkString("latitude", "longitude")

                    case name if name.startsWith("google:") =>
                        val meta = get[java.util.Map[String, java.io.Serializable]]("meta")
                        val metaS = meta.asScala.map(p => s"${p._1}=${p._2}").mkString(",")

                        // Mentions.
                        val beginOffsets = get[JList[Int]]("mentionsbeginoffsets")
                        val contents = get[JList[String]]("mentionscontents")
                        val types = get[JList[String]]("mentionstypes")

                        require(beginOffsets.size() == contents.size())
                        require(types.size() == contents.size())

                        val mentions =
                            beginOffsets.asScala.zip(contents.asScala).zip(types.asScala).
                                map { case ((o, c), t) => s"beginOffset=$o, content=$c, type=$t" }.mkString(", ")

                        val sal = mkDouble3("salience")

                        s"meta=[$metaS], mentions=[$mentions], salience=$sal"

                    case name if name.startsWith("opennlp:") => s"probability=${mkDouble3("probability")}"

                    case name if name.startsWith("stanford:") =>
                        val s = s"confidence=${mkDouble3("confidence")}"

                        if (has("nne")) s"$s, nne=${get[Any]("nne")}" else s

                    case name if name.startsWith("spacy:") =>
                        val s = s"vector=${mkDouble3("vector")}, sentiment=${mkDouble3("sentiment")}"

                        // Optional metadata is shown only when it is present and non-empty.
                        val metaOpt =
                            getOpt[java.util.Map[String, String]]("meta").map(_.asScala).filter(_.nonEmpty)

                        metaOpt match {
                            case Some(ms) =>
                                val metaS = ms.map(p => s"${p._1}=${p._2}").mkString(",")

                                s"$s, meta=$metaS"
                            case None => s
                        }

                    // User defined token.
                    case _ =>
                        // Recursively renders the token ID, its aliases (if any) and its part tokens (if any).
                        def tok2Str(t: NCToken): String = {
                            val idStr = s"id=${t.getId}"

                            val withAliases =
                                Option(t.meta(TOK_META_ALIASES_KEY).asInstanceOf[java.util.Set[String]]) match {
                                    case Some(aliases) => s"$idStr, aliases='${aliases.asScala.mkString(",")}'"
                                    case None => idStr
                                }

                            val parts = t.getPartTokens.asScala.map(tok2Str).mkString("|")

                            if (parts.nonEmpty) s"$withAliases, parts=[$parts]" else withAliases
                        }

                        tok2Str(tok)
                }

            // Rows of `nlpcraft:nlp` tokens stay plain with an empty token data cell,
            // all other rows are colorized and get the token data appended.
            val cells =
                if (id == "nlpcraft:nlp")
                    row.map(_.toString) :+ ""
                else
                    row.map(c => s"${ansi256Fg(183)}${c.toString}${ansiReset}") :+
                        s"<<${ansi256Fg(183)}$id$ansiReset>> $v"

            tbl += (cells :_*)
        }
    })

    tbl
}
| } |