WIP.

commit: ead6b356bc7cd30c7b198cc0406e68cdc7d33f4c [log] [tgz]
author: skhdl <skhdlemail@gmail.com> Sun Oct 23 12:20:46 2022 +0400
committer: skhdl <skhdlemail@gmail.com> Sun Oct 23 12:20:46 2022 +0400
tree: 8c6dd66c5bb90224924e43821ca0db3b2e68a212
parent: 37bad466ebdf4825a2d0dfc7419fd31c0f440456 [diff]
diff --git a/custom-components.html b/custom-components.html
index 71cced3..9e0030e 100644
--- a/custom-components.html
+++ b/custom-components.html

@@ -24,13 +24,171 @@
 <div class="col-md-8 second-column">
     <section id="overview">
         <h2 class="section-title">Custom components <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+
+        <p>
+            NlpCraft provides a number of useful built-in components for the English language.
+            You can use them to prepare a <code>Pipeline</code> for your <code>Model</code>.
+            You can also use the provided wrappers around the NER components of the <a href="https://opennlp.apache.org/">Apache OpenNLP</a> and
+            <a href="https://nlp.stanford.edu/">Stanford NLP</a> projects.
+            Their models work with English and some other languages.
+        </p>
+        <p>
+            However, you may need to extend the provided functionality and develop your own components.
+            Let's review these components step by step.
+        </p>
     </section>
+    <section id="token-parser">
+        <h2 class="section-title">Token parser <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+        <p>
+            You have to implement <a href="apis/latest/org/apache/nlpcraft/NCTokenParser.html">NCTokenParser</a> trait.
+        </p>
+        <p>
+            It is not often that you need to prepare your own language tokenizer.
+            It is mostly necessary when you want to work with a new language.
+            You have to prepare a new implementation only once and can then use it for all projects in this language.
+            Usually you just need to find an open source solution and wrap it in an implementation of the
+            <a href="apis/latest/org/apache/nlpcraft/NCTokenParser.html">NCTokenParser</a> trait.
+        </p>
+        <pre class="brush: scala, highlight: [2, 6]">
+            import org.apache.nlpcraft.*
+            import org.languagetool.tokenizers.fr.FrenchWordTokenizer
+            import scala.jdk.CollectionConverters.*
+
+            class NCFrTokenParser extends NCTokenParser:
+                private val tokenizer = new FrenchWordTokenizer
+
+                override def tokenize(text: String): List[NCToken] =
+                    val toks = collection.mutable.ArrayBuffer.empty[NCToken]
+                    var sumLen = 0
+
+                    for ((word, idx) <- tokenizer.tokenize(text).asScala.zipWithIndex)
+                        val start = sumLen
+                        val end = sumLen + word.length
+
+                        if word.strip.nonEmpty then
+                            toks += new NCPropertyMapAdapter with NCToken:
+                                override def getText: String = word
+                                override def getIndex: Int = idx
+                                override def getStartCharIndex: Int = start
+                                override def getEndCharIndex: Int = end
+
+                        sumLen = end
+
+                    toks.toList
+        </pre>
+        <ul>
+            <li>
+                <code>NCFrTokenParser</code> is a simple wrapper which implements <code>NCTokenParser</code> based on
+                the open source <a href="https://languagetool.org">LanguageTool</a> library.
+            </li>
+        </ul>
+    </section>
+
+    <section id="token-enricher">
+        <h2 class="section-title">Token enricher <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+        <p>
+            You have to implement <a href="apis/latest/org/apache/nlpcraft/NCTokenEnricher.html">NCTokenEnricher</a> trait.
+        </p>
+        <p>
+            <a href="apis/latest/org/apache/nlpcraft/NCToken.html">NCToken</a> instances are used in
+            <a href="intent-matching.html">Intent matching</a>. NlpCraft provides a number of built-in token enricher
+            implementations for the English language.
+            You may want to create your own or extend an existing one. Look at the following example:
+        </p>
+        <pre class="brush: scala, highlight: [25, 26]">
+            import org.apache.nlpcraft.*
+            import org.languagetool.AnalyzedToken
+            import org.languagetool.tagging.ru.RussianTagger
+            import scala.jdk.CollectionConverters.*
+
+            class NCRuLemmaPosTokenEnricher extends NCTokenEnricher:
+                private def nvl(v: String, dflt : => String): String = if v != null then v else dflt
+
+                override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
+                    val tags = RussianTagger.INSTANCE.tag(toks.map(_.getText).asJava).asScala
+
+                    require(toks.size == tags.size)
+
+                    toks.zip(tags).foreach { case (tok, tag) =>
+                        val readings = tag.getReadings.asScala
+
+                        val (lemma, pos) = readings.size match
+                            // No data. Lemma is word as is, POS is undefined.
+                            case 0 => (tok.getText, "")
+                            // Takes first. Other variants ignored.
+                            case _ =>
+                                val aTok: AnalyzedToken = readings.head
+                                (nvl(aTok.getLemma, tok.getText), nvl(aTok.getPOSTag, ""))
+
+                        tok.put("pos", pos)
+                        tok.put("lemma", lemma)
+
+                        () // Otherwise NPE.
+                    }
+        </pre>
+        <ul>
+            <li>
+                <code>Lines 25 and 26</code> enrich <a href="apis/latest/org/apache/nlpcraft/NCToken.html">NCToken</a>
+                with two new properties which can be used for <a href="intent-matching.html">Intent matching</a> later.
+            </li>
+        </ul>
+    </section>
+
+    <section id="token-validator">
+        <h2 class="section-title">Token validator <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+        <p>
+            You have to implement <a href="apis/latest/org/apache/nlpcraft/NCTokenValidator.html">NCTokenValidator</a> trait.
+        </p>
+    </section>
+
+    <section id="entity-parser">
+        <h2 class="section-title">Entity parser <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+        <p>
+            You have to implement <a href="apis/latest/org/apache/nlpcraft/NCEntityParser.html">NCEntityParser</a> trait.
+        </p>
+    </section>
+
+    <section id="entity-enricher">
+        <h2 class="section-title">Entity enricher <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+        <p>
+            You have to implement <a href="apis/latest/org/apache/nlpcraft/NCEntityEnricher.html">NCEntityEnricher</a> trait.
+        </p>
+    </section>
+
+    <section id="entity-mapper">
+        <h2 class="section-title">Entity mapper <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+        <p>
+            You have to implement <a href="apis/latest/org/apache/nlpcraft/NCEntityMapper.html">NCEntityMapper</a> trait.
+        </p>
+    </section>
+
+    <section id="entity-validator">
+        <h2 class="section-title">Entity validator <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+        <p>
+            You have to implement <a href="apis/latest/org/apache/nlpcraft/NCEntityValidator.html">NCEntityValidator</a> trait.
+        </p>
+    </section>
+
+    <section id="variant-filter">
+        <h2 class="section-title">Variant filter <a href="#"><i class="top-link fas fa-fw fa-angle-double-up"></i></a></h2>
+        <p>
+            You have to implement <a href="apis/latest/org/apache/nlpcraft/NCVariantFilter.html">NCVariantFilter</a> trait.
+        </p>
+    </section>
+
 </div>
 <div class="col-md-2 third-column">
     <ul class="side-nav">
         <li class="side-nav-title">On This Page</li>
         <li><a href="#overview">Overview</a></li>
-
+        <li><a href="#token-parser">Token parser</a></li>
+        <li><a href="#token-enricher">Token enricher</a></li>
+        <li><a href="#token-validator">Token validator</a></li>
+        <li><a href="#entity-parser">Entity parser</a></li>
+        <li><a href="#entity-enricher">Entity enricher</a></li>
+        <li><a href="#entity-mapper">Entity mapper</a></li>
+        <li><a href="#entity-validator">Entity validator</a></li>
+        <li><a href="#variant-filter">Variant filter</a></li>
         {% include quick-links.html %}
     </ul>
 </div>
commit	ead6b356bc7cd30c7b198cc0406e68cdc7d33f4c	[log] [tgz]
author	skhdl <skhdlemail@gmail.com>	Sun Oct 23 12:20:46 2022 +0400
committer	skhdl <skhdlemail@gmail.com>	Sun Oct 23 12:20:46 2022 +0400
tree	8c6dd66c5bb90224924e43821ca0db3b2e68a212
parent	37bad466ebdf4825a2d0dfc7419fd31c0f440456 [diff]