// blob: 37d030a90467aee0454564278a5b345b1d12cdc9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.server.geo.tools
import java.io.File
import com.fasterxml.jackson.core.`type`.TypeReference
import net.liftweb.json._
import org.apache.nlpcraft.common.nlp.dict.NCDictionaryManager
import org.apache.nlpcraft.common.{NCE, U}
import org.apache.nlpcraft.server.geo._
import scala.collection._
/**
 * Generator of additional (synthetic) synonyms for geo names.
 *
 * For every synonym of the geo model it derives spelling variants (space/dash
 * swaps and "st."/"saint" prefix swaps) that the model does not already
 * contain, prints them to the console and stores them in the output file.
 */
object NCGeoSyntheticNamesGenerator extends App {
    // Base synonym should be saved for console debug message.
    case class Holder(base: String, var entries: Set[NCGeoEntry])

    /**
     * Runs the whole generation: removes the previous output file, starts the
     * required managers, collects synthetic synonyms and writes the result.
     *
     * @param outFile Path of the file the synthetic synonyms are written to.
     * @throws NCE If an existing output file cannot be deleted.
     */
    private def process(outFile: String): Unit = {
        val file = new File(outFile)

        if (file.exists() && !file.delete())
            throw new NCE(s"Couldn't delete file: $file")

        val hs = mutable.Map.empty[String, Holder]

        NCDictionaryManager.start()

        // Managers are stopped even if generation fails, in reverse start order.
        try {
            NCGeoManager.start()

            try {
                collect(hs)
            }
            finally {
                NCGeoManager.stop()
            }
        }
        finally {
            NCDictionaryManager.stop()
        }

        if (hs.nonEmpty) {
            printResults(hs)
            writeJson(hs, outFile)
        }
        else
            println("All synthetic names already generated. Nothing to add.")
    }

    /**
     * Fills the given map with synthetic synonyms derived from the synonyms of
     * the geo model. Requires `NCGeoManager` to be started.
     *
     * @param hs Accumulator: synthetic synonym -> holder (first base string and geo entries).
     */
    private def collect(hs: mutable.Map[String, Holder]): Unit = {
        // Loop-invariant lookup, done once.
        val modelSyns = NCGeoManager.getModel.synonyms

        println(s"Synonyms count: ${modelSyns.size}")

        for ((synonym, entries) ← modelSyns) {
            val strs2Process = mutable.Set.empty[String] + synonym

            // Variants already derived from this particular model synonym.
            // It guarantees termination of the loop below while still letting a variant
            // produced by several model synonyms accumulate the entries of all of them
            // (a global `hs.contains` check would make that merge unreachable).
            val seen = mutable.Set.empty[String]

            def add(s: String, base: String): Unit =
                if (!modelSyns.contains(s) && seen.add(s)) {
                    strs2Process += s

                    hs.get(s) match {
                        case Some(h) ⇒ h.entries ++= entries
                        case None ⇒ hs += s → Holder(base, entries)
                    }
                }

            // Swaps all spaces for dashes and all dashes for spaces.
            def generateDash(str: String): Unit = {
                def generate(a: String, b: String): Unit =
                    if (str.contains(a))
                        // Literal replacement, no regular expression involved.
                        add(str.replace(a, b), str)

                generate(" ", "-")
                generate("-", " ")
            }

            // Swaps the leading "st."/"saint" prefix for its alternative spellings.
            def generateSaints(str: String): Unit = {
                def generate(beginStr: String, replacements: String*): Unit =
                    if (str.startsWith(beginStr)) {
                        // Prefix is replaced literally ('.' must not act as a regex wildcard).
                        val tail = str.substring(beginStr.length)

                        replacements.foreach(r ⇒ add(r + tail, str))
                    }

                generate("st. ", "saint ", "saint-", "st.", "st-", "st ")
                generate("saint ", "saint-", "st. ", "st.", "st-", "st ")

                // "st." glued directly to the name (no space or dash after the dot).
                if (str.length > 3 && str(3) != ' ' && str(3) != '-')
                    generate("st.", "saint ", "saint-", "st. ", "st-", "st ")
            }

            // Work-list: every newly added variant is expanded in turn.
            while (strs2Process.nonEmpty) {
                val str = strs2Process.last

                strs2Process.remove(str)

                generateDash(str)
                generateSaints(str)
            }
        }
    }

    /**
     * Groups the generated synonyms by geo entry and writes them to the output
     * file via `U.getYamlMapper`, together with the content of that file if it
     * already exists.
     *
     * @param buf Generated synonyms: synthetic synonym -> holder.
     * @param outFile Path of the file to write.
     */
    private def writeJson(buf: Map[String, Holder], outFile: String): Unit = {
        val syns = mutable.Map.empty[NCGeoEntry, NCGeoSynonym]

        buf.foreach { case (s, holder) ⇒
            for (e ← holder.entries)
                syns.get(e) match {
                    case Some(syn) ⇒ syn.synonyms :+= s
                    case None ⇒
                        val synonym = e match {
                            case m: NCGeoMetro ⇒
                                NCGeoSynonym(None, None, None, None, None, Some(m.name), List(s))
                            case c: NCGeoContinent ⇒
                                NCGeoSynonym(None, None, None, None, Some(c.name), None, List(s))
                            case sc: NCGeoSubContinent ⇒
                                NCGeoSynonym(None, None, None, Some(sc.name), Some(sc.continent.name), None, List(s))
                            // Short representation (without subcontinent and continent.)
                            case c: NCGeoCountry ⇒
                                NCGeoSynonym(None, None, Some(c.name), None, None, None, List(s))
                            // Short representation (without subcontinent and continent.)
                            case r: NCGeoRegion ⇒
                                NCGeoSynonym(None, Some(r.name), Some(r.country.name), None, None, None, List(s))
                            // Short representation (without subcontinent and continent.)
                            case c: NCGeoCity ⇒
                                NCGeoSynonym(
                                    Some(c.name), Some(c.region.name), Some(c.region.country.name), None, None, None, List(s)
                                )
                            case _ ⇒ throw new AssertionError(s"Unexpected object: $e")
                        }

                        syns += e → synonym
                }
        }

        // Required for Lift JSON processing.
        // NOTE(review): nothing visible in this method takes it explicitly - confirm it is still needed.
        implicit val formats: DefaultFormats.type = net.liftweb.json.DefaultFormats

        val f = new File(outFile)

        val existing =
            if (f.exists())
                U.extractYamlFile(f, ignoreCase = false, new TypeReference[List[NCGeoSynonym]] {})
            else
                Seq.empty[NCGeoSynonym]

        U.getYamlMapper.writeValue(f, (syns.values ++ existing).toSet)
    }

    /**
     * Prints the generated synonyms grouped by their base string (sorted by base)
     * followed by the total number of generated synonyms.
     *
     * @param buf Generated synonyms: synthetic synonym -> holder.
     */
    private def printResults(buf: Map[String, Holder]): Unit = {
        val byBase = mutable.Map.empty[String, Seq[String]]

        buf.foreach { case (newSyn, holder) ⇒
            val baseSyn = holder.base

            byBase += baseSyn → (byBase.getOrElse(baseSyn, Seq.empty[String]) :+ newSyn)
        }

        byBase.toSeq.sortBy(_._1).foreach { case (baseSyn, added) ⇒
            val s = added.map(a ⇒ s"'$a'").mkString(", ")

            println(s"Synonyms added: $s for base: '$baseSyn'.")
        }

        println(s"Synonyms count: ${buf.size}.")
    }

    process(
        U.mkPath("nlpcraft/src/main/resources/geo/synonyms/synthetic.yaml")
    )
}