Add char dropout to the normalizer to handle unknown chars
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
index cf182a6..5629a06 100644
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
@@ -91,7 +91,8 @@
 
     for (int textIndex = 0; textIndex < texts.length; textIndex++) {
       for (int charIndex = 0; charIndex < texts[textIndex].length(); charIndex++) {
-        charIds[textIndex][charIndex] = sourceCharMap.get(texts[textIndex].charAt(charIndex));
+        charIds[textIndex][charIndex] =
+                sourceCharMap.getOrDefault(texts[textIndex].charAt(charIndex), 0);
       }
 
       textLengths[textIndex] = texts[textIndex].length();
diff --git a/tf-ner-poc/src/main/python/normalizer/normalizer.py b/tf-ner-poc/src/main/python/normalizer/normalizer.py
index 8286ce1..6664cb5 100644
--- a/tf-ner-poc/src/main/python/normalizer/normalizer.py
+++ b/tf-ner-poc/src/main/python/normalizer/normalizer.py
@@ -23,7 +23,7 @@
 
 import tensorflow as tf
 import numpy as np
-
+import random
 from math import floor
 
 def load_data(file):
@@ -202,6 +202,7 @@
     source_test, target_test = load_data("date_test.txt")
 
     source_char_dict = encode_chars(source_train + source_dev + source_test)
+    source_char_dict[chr(0)] = 0
 
     target_char_dict = encode_chars(target_train + target_dev + target_test)
 
@@ -244,6 +245,11 @@
                 target_batch, target_length, source_batch, source_length = \
                     mini_batch(target_char_dict, target_train, source_char_dict, source_train, batch_size, batch_index)
 
+                # Char dropout: randomly map a small fraction (~0.05%) of input chars to the unknown char id (0)
+                for i, j in np.ndindex(source_batch.shape):
+                    if random.uniform(0, 1) <= 0.0005:
+                        source_batch[i][j] = 0
+
                 feed_dict = {t_encoder_lengths_ph: source_length, t_encoder_char_ids_ph: source_batch,
                              t_decoder_lengths: target_length, t_decoder_char_ids_ph: target_batch}