Add a script to generate date normalization data
diff --git a/tf-ner-poc/src/main/python/normalizer/date_generator.py b/tf-ner-poc/src/main/python/normalizer/date_generator.py
new file mode 100644
index 0000000..3526f3b
--- /dev/null
+++ b/tf-ner-poc/src/main/python/normalizer/date_generator.py
@@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from faker import Faker
+from babel.dates import format_date
+import random
+from datetime import datetime
+
+fake = Faker()
+
+# TODO: If possible set date range on Faker
+
+# Babel (CLDR) patterns for the human-readable source dates. Lowercase 'y'
+# is the calendar year; uppercase 'Y' is the week-numbering year, which can
+# differ from the calendar year in the days around New Year, so it must not
+# be used here.
+# Repeated entries make random.choice() pick that format more often.
+FORMATS = ['short',
+           'medium',
+           'long',
+           'dd MMM yyyy',
+           'dd MMM',
+           'dd, MMM yyyy',
+           'dd, MMM',
+           'd MMM yyyy',
+           'd MMM',
+           'd MMMM yyyy',
+           'MMMM yyyy',
+           'd MMMM, yyyy',
+           'd MMM, yyyy',
+           'd MM yy',
+           'd MMMM yyyy',
+           'MMMM d yyyy',
+           'MMMM yyyy',
+           'MMM yyyy',
+           'MMMM d, yyyy',
+           'dd.MM.yy',
+           'dd.MM']
+FORMATS += ['full'] * 10  # 'full' gets ten entries in the sampling pool
+
+# Named locale formats; their targets are never masked below.
+NAMED_FORMATS = frozenset(['short', 'medium', 'long', 'full'])
+
+# TODO: maybe avoid duplicates, output dates also for other locales such as German and French ...
+
+# Write 2000 random samples, one per line, as <target>TAB<source>: the source
+# is the date rendered for de_DE, the target is the same date as yyyyMMdd.
+with open('date_dev_deu.txt', 'w', encoding="utf-8") as f:
+    for _ in range(2000):
+        dt = fake.date_time_ad(start_datetime=datetime(1900, 1, 1))
+
+        pattern = random.choice(FORMATS)
+        source_date = format_date(dt, format=pattern, locale='de_DE')
+        # Calendar year ('y'), so the target agrees with the source text.
+        target_date = format_date(dt, format='yyyyMMdd', locale='en_US')
+
+        if pattern not in NAMED_FORMATS:
+            # Zero out components that the source pattern does not render.
+            if 'y' not in pattern:
+                # No year in the source text.
+                target_date = '0000' + target_date[4:]
+
+            if 'd' not in pattern:
+                # No day of month in the source text.
+                target_date = target_date[:6] + '00'
+
+        f.write(target_date + '\t' + source_date + '\n')
\ No newline at end of file