Add first draft of normalizer Java API
diff --git a/tf-ner-poc/pom.xml b/tf-ner-poc/pom.xml
index 45bdce0..8042da9 100644
--- a/tf-ner-poc/pom.xml
+++ b/tf-ner-poc/pom.xml
@@ -9,7 +9,7 @@
<version>1.0-SNAPSHOT</version>
<properties>
- <tensorflow.version>1.7.0</tensorflow.version>
+ <tensorflow.version>1.12.0</tensorflow.version>
</properties>
<dependencies>
@@ -43,6 +43,7 @@
<target>1.8</target>
</configuration>
</plugin>
+
</plugins>
</build>
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
new file mode 100644
index 0000000..2ad4809
--- /dev/null
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.normalizer;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.apache.opennlp.ModelUtil;
+import org.tensorflow.SavedModelBundle;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+
+public class Normalizer {
+
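+ // sourceCharMap encodes input characters to the ids the encoder was trained on;
+ // targetCharMap decodes output ids back to characters of the normalized form.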
+ private final Session session;
+ private final Map<Character, Integer> sourceCharMap;
+ private final Map<Integer, Character> targetCharMap;
+
+ Normalizer(InputStream sourceCharMapIn, InputStream targetCharMapIn,
+ InputStream modelZipPackage) throws IOException {
+
+ Path tmpModelPath = ModelUtil.writeModelToTmpDir(modelZipPackage);
+
+ SavedModelBundle model = SavedModelBundle.load(tmpModelPath.toString(), "serve");
+ session = model.session();
+
+ sourceCharMap = loadCharMap(sourceCharMapIn).entrySet()
+ .stream()
+ .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
+
+ targetCharMap = loadCharMap(targetCharMapIn);
+ }
+
+ private static Map<Integer, Character> loadCharMap(InputStream in) throws IOException {
+ try(BufferedReader reader = new BufferedReader(
+ new InputStreamReader(in, StandardCharsets.UTF_8))) {
+ Map<Integer, Character> characterMap = new HashMap<>();
+
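+ // The dict file holds one character per line; the zero-based line index is the id.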
+ String line;
+ while ((line = reader.readLine()) != null) {
+ characterMap.put(characterMap.size(), line.charAt(0));
+ }
+
+ return Collections.unmodifiableMap(characterMap);
+ }
+ }
+
+ public String[] normalize(String[] texts) {
+
+ // TODO: Batch size is hard-coded in the graph; make it dynamic or add padding here
+
+ int[] textLengths = Arrays.stream(texts).mapToInt(String::length).toArray();
+ int maxLength = Arrays.stream(textLengths).max().orElse(0);
+
+ int[][] charIds = new int[texts.length][maxLength];
+
+ for (int textIndex = 0; textIndex < texts.length; textIndex++) {
+ for (int charIndex = 0; charIndex < texts[textIndex].length(); charIndex++) {
+ charIds[textIndex][charIndex] = sourceCharMap.get(texts[textIndex].charAt(charIndex));
+ }
+ }
+
+ try (Tensor<?> charTensor = Tensor.create(charIds);
+ Tensor<?> textLength = Tensor.create(textLengths)) {
+
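+ // The fed/fetched names must line up with the eval graph exported by
+ // normalizer.py; "decode" is the tf.identity output named there.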
+ List<Tensor<?>> result = session.runner()
+ .feed("encoder_char_ids", charTensor)
+ .feed("encoder_lengths", textLength)
+ .fetch("decode", 0).run();
+
+ try (Tensor<?> translationTensor = result.get(0)) {
+ // TODO: This shouldn't be hard-coded; the normalized form doesn't have a fixed length
+ int[][] translations =
+ translationTensor.copyTo(new int[texts.length][9]); // shape is (20, 9) in eval py code
+
+ List<String> normalizedTexts = new ArrayList<>();
+
+ for (int ti = 0; ti < translations.length; ti++) {
+ StringBuilder normalizedText = new StringBuilder();
+ for (int ci = 0; ci < translations[ti].length; ci++) {
+ normalizedText.append(targetCharMap.get(translations[ti][ci]));
+ }
+
+ normalizedTexts.add(normalizedText.toString());
+ }
+
+ return normalizedTexts.toArray(new String[normalizedTexts.size()]);
+ }
+ }
+ }
+}
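For illustration, a minimal sketch of driving the new API once training has produced its artifacts. The class name NormalizerUsage, the dict file names, and normalizer_model.zip are assumptions: the two dict files are what normalizer.py writes below, and ModelUtil.writeModelToTmpDir is fed the SavedModel as a zip stream. The sketch lives in the same package because the constructor is package-private.

    package org.apache.opennlp.normalizer;

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    public class NormalizerUsage {
        public static void main(String[] args) throws Exception {
            // Paths are hypothetical; the dicts are written by normalizer.py,
            // the zip is assumed to package the exported SavedModel.
            try (InputStream sourceDict = Files.newInputStream(Paths.get("source_char_dict.txt"));
                 InputStream targetDict = Files.newInputStream(Paths.get("target_char_dict.txt"));
                 InputStream modelZip = Files.newInputStream(Paths.get("normalizer_model.zip"))) {

                Normalizer normalizer = new Normalizer(sourceDict, targetDict, modelZip);

                // The exported graph assumes a fixed batch size (20 in the
                // training script), so a full batch is passed in one call.
                String[] normalized = normalizer.normalize(new String[] {
                    "1 May 2018", "2nd of June 1997" /* ..., up to the batch size */});

                System.out.println(normalized[0]);
            }
        }
    }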
diff --git a/tf-ner-poc/src/main/python/normalizer/normalizer.py b/tf-ner-poc/src/main/python/normalizer/normalizer.py
index 18e9338..86e735e 100644
--- a/tf-ner-poc/src/main/python/normalizer/normalizer.py
+++ b/tf-ner-poc/src/main/python/normalizer/normalizer.py
@@ -161,16 +161,6 @@
gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
optimize = optimizer.apply_gradients(zip(gradients, v))
- # decoder is here ...
- #helperE = tf.contrib.seq2seq.GreedyEmbeddingHelper(
- # decoder_embedding_weights,
- # tf.fill([batch_size], decoder_nchars-2), decoder_nchars-1)
- #decoderE = tf.contrib.seq2seq.BasicDecoder(
- # decoder_cell, helperE, encoder_state,
- # output_layer=projection_layer)
- #outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoderE, maximum_iterations=15, output_time_major=True)
-
-
return encoder_char_ids_ph, encoder_lengths_ph, decoder_char_ids_ph, decoder_lengths, optimize, train_prediction, outputs
if "EVAL" == mode:
@@ -182,10 +172,11 @@
output_layer=projection_layer)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoderE, maximum_iterations=15)
- translations = outputs.sample_id
+
+ translations = tf.identity(outputs.sample_id, name="decode")
# the outputs don't decode anything ...
- return encoder_char_ids_ph, encoder_lengths_ph, outputs
+ return encoder_char_ids_ph, encoder_lengths_ph, translations
def encode_chars(names):
char_set = set()
@@ -193,6 +184,12 @@
char_set = char_set.union(name)
return {k: v for v, k in enumerate(char_set)}
+# TODO: Deduplicate this, same as in namefinder.py
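+# Writes one entry per line; dict insertion order (Python 3.7+) matches the ids
+# assigned by encode_chars, which is what Normalizer.loadCharMap expects.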
+def write_mapping(tags, output_filename):
+ with open(output_filename, 'w', encoding='utf-8') as f:
+ for tag in tags:
+ f.write('{}\n'.format(tag))
+
def main():
checkpoints_path = "/tmp/model/checkpoints"
@@ -202,12 +199,17 @@
source_test, target_test = load_data("date_test.txt")
source_char_dict = encode_chars(source_train + source_dev + source_test)
+
+ write_mapping(source_char_dict, 'source_char_dict.txt')
+
target_char_dict = encode_chars(target_train + target_dev + target_test)
# TODO: Find better chars for begin and end markers
target_char_dict['S'] = len(target_char_dict)
target_char_dict['E'] = len(target_char_dict)
+ write_mapping(target_char_dict, 'target_char_dict.txt')
+
target_dict_rev = {v: k for k, v in target_char_dict.items()}
batch_size = 20
@@ -230,58 +232,61 @@
create_graph("EVAL", batch_size, len(source_char_dict), target_max_len, len(target_char_dict))
eval_saver = tf.train.Saver()
- eval_sess = tf.Session(graph=eval_graph)
+ eval_sess = tf.Session(graph=eval_graph)
-
- for epoch in range(200):
+ for epoch in range(30):
print("Epoch " + str(epoch))
- for batch_index in range(floor(len(source_train) / batch_size)):
- if batch_index > 0 and batch_index % 100 == 0:
- print("batch_index " + str(batch_index))
+ with train_graph.as_default():
+ for batch_index in range(floor(len(source_train) / batch_size)):
+ if batch_index > 0 and batch_index % 100 == 0:
+ print("batch_index " + str(batch_index))
- target_batch, target_length, source_batch, source_length = \
- mini_batch(target_char_dict, target_train, source_char_dict, source_train, batch_size, batch_index)
+ target_batch, target_length, source_batch, source_length = \
+ mini_batch(target_char_dict, target_train, source_char_dict, source_train, batch_size, batch_index)
- feed_dict = {t_encoder_lengths_ph: source_length, t_encoder_char_ids_ph: source_batch,
- t_decoder_lengths: target_length, t_decoder_char_ids_ph: target_batch}
+ feed_dict = {t_encoder_lengths_ph: source_length, t_encoder_char_ids_ph: source_batch,
+ t_decoder_lengths: target_length, t_decoder_char_ids_ph: target_batch}
- t1, dec1 = train_sess.run([t_adam_optimize, t_dec_out], feed_dict)
- dec2 = train_sess.run([t_dec_out], feed_dict)
- tv=1
+ train_sess.run(t_adam_optimize, feed_dict)
- # Save train model, and restore it into the eval session
- checkpoint_path = train_saver.save(train_sess, checkpoints_path, global_step=epoch)
- eval_saver.restore(eval_sess, checkpoint_path)
+ # Save train model, and restore it into the eval session
+ checkpoint_path = train_saver.save(train_sess, checkpoints_path, global_step=epoch)
+ eval_saver.restore(eval_sess, checkpoint_path)
- count_correct = 0
- for batch_index in range(floor(len(source_dev) / batch_size)):
- target_batch, target_length, source_batch, source_length = \
- mini_batch(target_char_dict, target_dev, source_char_dict, source_dev, batch_size, batch_index)
+ with eval_graph.as_default():
+ count_correct = 0
+ for batch_index in range(floor(len(source_dev) / batch_size)):
+ target_batch, target_length, source_batch, source_length = \
+ mini_batch(target_char_dict, target_dev, source_char_dict, source_dev, batch_size, batch_index)
- begin = batch_index
- end = min(batch_index + batch_size, len(source_dev))
- target_strings = target_dev[begin:end]
+ begin = batch_index
+ end = min(batch_index + batch_size, len(source_dev))
+ target_strings = target_dev[begin:end]
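+ # TODO: begin should probably be batch_index * batch_size; verify against mini_batch's indexing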
+ feed_dict = {e_encoder_lengths_ph: source_length, e_encoder_char_ids_ph: source_batch}
+ result = eval_sess.run(e_dec_out, feed_dict)
- feed_dict = {e_encoder_lengths_ph: source_length, e_encoder_char_ids_ph: source_batch}
- result = eval_sess.run(e_dec_out, feed_dict)
+ decoded_dates = []
+ for coded_date in result:
+ date = ""
+ for char_id in coded_date:
+ if char_id != len(target_char_dict) - 1:  # skip the 'E' end marker id
+ date = date + target_dict_rev[char_id]
+ decoded_dates.append(date)
- decoded_dates = []
+ for i in range(len(target_strings)):
+ if target_strings[i] == decoded_dates[i]:
+ count_correct = count_correct + 1
- for coded_date in result.sample_id:
- date = ""
- for char_id in coded_date:
- if not char_id == len(target_char_dict) - 1:
- date = date + (target_dict_rev[char_id])
- decoded_dates.append(date)
+ print("Dev: " + str(count_correct / len(target_dev)))
- for i in range(len(target_strings)):
- if target_strings[i] == decoded_dates[i]:
- count_correct = count_correct + 1
-
- print("Dev: " + str(count_correct / len(target_dev)))
+ builder = tf.saved_model.builder.SavedModelBuilder("./normalizer_model" + str(epoch))
+ builder.add_meta_graph_and_variables(eval_sess, [tf.saved_model.tag_constants.SERVING])
+ builder.save()
if __name__ == "__main__":
main()