| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| |
# This proof of concept is based on source code taken from:
| # https://github.com/guillaumegenthial/sequence_tagging |
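#
# Example invocation (the file names are illustrative):
#
#   python namefinder.py glove.6B.100d.txt train.txt dev.txt test.txt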
| |
| import sys |
| from math import floor |
| import tensorflow as tf |
| import re |
| import numpy as np |
| |
| # Parse the OpenNLP Name Finder format into begin, end, type triples |
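# A training line looks like, for example:
#   <START:person> Pierre Vinken <END> , 61 years old , will join the board .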
| class NameSample: |
| |
| def __init__(self, line): |
| self.tokens = [] |
| self.names = [] |
| start_regex = re.compile("<START(:([^:>\\s]*))?>") |
| parts = line.split() |
| start_index = -1 |
| word_index = 0 |
| for i in range(0, len(parts)): |
| if start_regex.match(parts[i]): |
| start_index = word_index |
                name_type = start_regex.search(parts[i]).group(2)
                if name_type is None:
                    name_type = "default"
| elif parts[i] == "<END>": |
| self.names.append((start_index, word_index, name_type)) |
| else: |
| self.tokens.append(parts[i]) |
| word_index += 1 |
| |
| class NameFinder: |
| |
| def __init__(self): |
| self.label_dict = {} |
| |
| def load_glove(self, glove_file): |
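        # Each line of a GloVe text file is the token followed by its vector
        # components, separated by spaces, e.g. "the 0.418 0.24968 -0.41242 ..."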
        with open(glove_file, encoding="utf-8") as f:
| |
| word_dict = {} |
| embeddings = [] |
| |
| for line in f: |
| parts = line.strip().split(" ") |
| word_dict[parts[0]] = len(word_dict) |
| embeddings.append(np.array(parts[1:], dtype=np.float32)) |
| |
            # Create a reverse word dict (id -> word)
            rev_word_dict = {word_id: word for word, word_id in word_dict.items()}
| |
| return word_dict, rev_word_dict, np.asarray(embeddings) |
| |
| def load_data(self, word_dict, file): |
        with open(file, encoding="utf-8") as f:
| raw_data = f.readlines() |
| |
| sentences = [] |
| labels = [] |
| chars_set = set() |
| |
| for line in raw_data: |
| name_sample = NameSample(line) |
| sentence = [] |
| |
| if len(name_sample.tokens) == 0: |
| continue |
| |
| for token in name_sample.tokens: |
                # Tokens missing from the embedding vocabulary fall back to id 0
                vector = word_dict.get(token, 0)
| |
| sentence.append(vector) |
| |
| for c in token: |
| chars_set.add(c) |
| |
| label = ["other"] * len(name_sample.tokens) |
| for name in name_sample.names: |
| label[name[0]] = "B-" + name[2] |
| for i in range(name[0] + 1, name[1]): |
| label[i] = "I-" + name[2] |
| sentences.append(sentence) |
| labels.append(label) |
| |
| for label_string in label: |
                if label_string not in self.label_dict:
| self.label_dict[label_string] = len(self.label_dict) |
| |
| return sentences, labels, chars_set |
| |
| def encode_labels(self, labels): |
| label_ids = [] |
| for label in labels: |
| label_ids.append(self.label_dict[label]) |
| |
| return label_ids |
| |
| |
| def mini_batch(self, rev_word_dict, char_dict, sentences, labels, batch_size, batch_index): |
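        # Build one padded batch: token ids and label ids are padded to the
        # longest sentence in the batch, char ids to the longest word.
        # Returns (sentence_batch, char_batch, word_length_batch, label_batch,
        # sentence_lengths).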
| begin = batch_size * batch_index |
| end = min(batch_size * (batch_index + 1), len(labels)) |
| |
| # Determine the max sentence length in the batch |
| max_length = 0 |
| for i in range(begin, end): |
| length = len(sentences[i]) |
| if length > max_length: |
| max_length = length |
| |
| sb = [] |
| lb = [] |
| seq_length = [] |
| for i in range(begin, end): |
| sb.append(sentences[i] + [0] * max(max_length - len(sentences[i]), 0)) |
| lb.append(self.encode_labels(labels[i]) + [0] * max(max_length - len(labels[i]), 0)) |
| seq_length.append(len(sentences[i])) |
| |
| # Determine the max word length in the batch |
| max_word_length = 0 |
| for i in range(begin, end): |
| for word in sentences[i]: |
| length = len(rev_word_dict[word]) |
| if length > max_word_length: |
| max_word_length = length |
| |
| cb = [] |
| wlb = [] |
| for i in range(begin, end): |
| sentence_word_length = [] |
| sentence_word_chars = [] |
| for word in sentences[i]: |
| |
| word_chars = [] |
| for c in rev_word_dict[word]: |
                    word_chars.append(char_dict.get(c, 0))  # map chars missing from char_dict to id 0 instead of raising KeyError
| |
| sentence_word_length.append(len(word_chars)) |
| word_chars = word_chars + [0] * max(max_word_length - len(word_chars), 0) |
| sentence_word_chars.append(word_chars) |
| |
            # Pad with all-zero "words" up to the max sentence length
            for _ in range(max(max_length - len(sentence_word_chars), 0)):
| sentence_word_chars.append([0] * max_word_length) |
| |
| cb.append(sentence_word_chars) |
| wlb.append(sentence_word_length + [0] * max(max_length - len(sentence_word_length), 0)) |
| |
| return sb, cb, wlb, lb, seq_length |
| |
| |
    def create_graph(self, nchars, embedding_dict):  # probably not necessary to pass in the embedding_dict; it could be passed to __init__ directly
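        # Architecture (following the sequence_tagging reference implementation):
        # a character-level BiLSTM builds a per-word representation, which is
        # concatenated with the pretrained GloVe vector; a word-level BiLSTM
        # then feeds a linear projection scored by a CRF output layer.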
| |
| |
| with tf.variable_scope("chars"): |
| # shape = (batch size, max length of sentence, max length of word) |
| char_ids = tf.placeholder(tf.int32, shape=[None, None, None]) |
| |
| # shape = (batch_size, max_length of sentence) |
| word_lengths_ph = tf.placeholder(tf.int32, shape=[None, None]) |
| |
| dim_char = 100 |
| |
| # 1. get character embeddings |
| K = tf.get_variable(name="char_embeddings", dtype=tf.float32, |
| shape=[nchars, dim_char]) |
| |
| # shape = (batch, sentence, word, dim of char embeddings) |
| char_embeddings = tf.nn.embedding_lookup(K, char_ids) |
| |
| # 2. put the time dimension on axis=1 for dynamic_rnn |
| s = tf.shape(char_embeddings) # store old shape |
| # shape = (batch x sentence, word, dim of char embeddings) |
| char_embeddings = tf.reshape(char_embeddings, shape=[s[0]*s[1], s[-2], dim_char]) |
| word_lengths = tf.reshape(word_lengths_ph, shape=[s[0]*s[1]]) |
| |
| # 3. bi lstm on chars |
| char_hidden_size = 100 |
| cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True) |
| cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True) |
| |
| _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw, |
| cell_bw, |
| char_embeddings, |
| sequence_length=word_lengths, |
| dtype=tf.float32) |
| # shape = (batch x sentence, 2 x char_hidden_size) |
| output = tf.concat([output_fw, output_bw], axis=-1) |
| |
| # shape = (batch, sentence, 2 x char_hidden_size) |
| char_rep = tf.reshape(output, shape=[-1, s[1], 2*char_hidden_size]) |
| |
| with tf.variable_scope("words"): |
| token_ids = tf.placeholder(tf.int32, shape=[None, None]) |
| sequence_lengths = tf.placeholder(tf.int32, shape=[None]) |
| |
            # Feed the embedding matrix in through a placeholder: initializing
            # a variable from a constant would fail for matrices larger than
            # 2 GB (the GraphDef protobuf limit), and this also avoids
            # hardcoding the embedding dimensionality.
            embedding_placeholder = tf.placeholder(dtype=tf.float32, name="embedding_placeholder",
                                                   shape=embedding_dict.shape)
| embedding_matrix = tf.Variable(embedding_placeholder, dtype=tf.float32, trainable=False, name="glove_embeddings") |
| |
| token_embeddings = tf.nn.embedding_lookup(embedding_matrix, token_ids) |
| |
| # shape = (batch, sentence, 2 x char_hidden_size + word_vector_size) |
| word_embeddings = tf.concat([token_embeddings, char_rep], axis=-1) |
| |
            # NOTE: the dropout rate is hard-coded, so dropout is also applied
            # at prediction time; a keep_prob placeholder would avoid that.
            word_embeddings = tf.nn.dropout(word_embeddings, 0.5)
| |
| hidden_size = 300 |
| |
        # Let's add a char LSTM layer to reproduce the state-of-the-art results ...
| |
| with tf.variable_scope("bi-lstm"): |
| # Add LSTM layer |
| cell_fw = tf.contrib.rnn.LSTMCell(hidden_size) |
| cell_bw = tf.contrib.rnn.LSTMCell(hidden_size) |
| |
| (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, word_embeddings, |
| sequence_length=sequence_lengths, dtype=tf.float32) |
| |
| context_rep = tf.concat([output_fw, output_bw], axis=-1) |
| |
| context_rep = tf.nn.dropout(context_rep, 0.5) |
| |
| labels = tf.placeholder(tf.int32, shape=[None, None], name="labels") |
| |
        ntags = len(self.label_dict)  # one output class per distinct BIO label
| |
| W = tf.get_variable("W", shape=[2*hidden_size, ntags], dtype=tf.float32) |
| b = tf.get_variable("b", shape=[ntags], dtype=tf.float32, initializer=tf.zeros_initializer()) |
| ntime_steps = tf.shape(context_rep)[1] |
| context_rep_flat = tf.reshape(context_rep, [-1, 2*hidden_size]) |
| pred = tf.matmul(context_rep_flat, W) + b |
| self.logits = tf.reshape(pred, [-1, ntime_steps, ntags]) |
| |
        # The CRF layer returns the per-sentence log-likelihood of the gold
        # labels along with the learned transition matrix, which is reused
        # for Viterbi decoding at prediction time.
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            self.logits, labels, sequence_lengths)
| |
| self.transition_params = transition_params |
| |
| loss = tf.reduce_mean(-log_likelihood) |
| |
| train_op = tf.train.AdamOptimizer().minimize(loss) |
| |
| return embedding_placeholder, token_ids, char_ids, word_lengths_ph, \ |
| sequence_lengths, labels, train_op |
| |
| def predict_batch(self, sess, token_ids_ph, char_ids_ph, word_lengths_ph, |
| sequence_lengths_ph, sentences, char_ids, word_length, lengths): |
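        # Run the graph once to get logits and the CRF transition parameters,
        # then Viterbi-decode each sentence up to its true length.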
| |
| feed_dict = {token_ids_ph: sentences, char_ids_ph: char_ids, word_lengths_ph: word_length, |
| sequence_lengths_ph: lengths} |
| |
| viterbi_sequences = [] |
| logits, trans_params = sess.run([self.logits, self.transition_params], feed_dict=feed_dict) |
| |
| for logit, sequence_length in zip(logits, lengths): |
| if sequence_length != 0: |
| logit = logit[:sequence_length] # keep only the valid steps |
| viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(logit, trans_params) |
| viterbi_sequences += [viterbi_seq] |
            else:
                # Keep predictions aligned with the lengths list even for
                # empty sequences
                viterbi_sequences += [[]]
| |
| return viterbi_sequences, lengths |
| |
| def get_chunk_type(tok, idx_to_tag): |
| tag_name = idx_to_tag[tok] |
| tag_class = tag_name.split('-')[0] |
| tag_type = tag_name.split('-')[-1] |
| return tag_class, tag_type |
| |
| def get_chunks(seq, tags): |
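    # Convert a sequence of label ids into (chunk_type, start, end) triples.
    # With a hypothetical tag dict {"other": 0, "B-person": 1, "I-person": 2},
    # the sequence [1, 2, 0] yields [("person", 0, 2)].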
| default = tags["other"] |
| idx_to_tag = {idx: tag for tag, idx in tags.items()} |
| chunks = [] |
| chunk_type, chunk_start = None, None |
| for i, tok in enumerate(seq): |
| # End of a chunk 1 |
| if tok == default and chunk_type is not None: |
| # Add a chunk. |
| chunk = (chunk_type, chunk_start, i) |
| chunks.append(chunk) |
| chunk_type, chunk_start = None, None |
| |
| # End of a chunk + start of a chunk! |
| elif tok != default: |
| tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag) |
| if chunk_type is None: |
| chunk_type, chunk_start = tok_chunk_type, i |
| elif tok_chunk_type != chunk_type or tok_chunk_class == "B": |
| chunk = (chunk_type, chunk_start, i) |
| chunks.append(chunk) |
| chunk_type, chunk_start = tok_chunk_type, i |
| else: |
| pass |
| |
| # end condition |
| if chunk_type is not None: |
| chunk = (chunk_type, chunk_start, len(seq)) |
| chunks.append(chunk) |
| |
| return chunks |
| |
| def write_mapping(tags, output_filename): |
| with open(output_filename, 'w', encoding='utf-8') as f: |
        for tag in tags:
            f.write('{}\n'.format(tag))
| |
| def main(): |
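    # Pipeline: load the GloVe vectors and the train/dev data, build the
    # graph, train with mini-batches, and report token accuracy and
    # chunk-level P/R/F1 on the dev set after every epoch.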
| |
| if len(sys.argv) != 5: |
| print("Usage namefinder.py embedding_file train_file dev_file test_file") |
| return |
| |
| name_finder = NameFinder() |
| |
| word_dict, rev_word_dict, embeddings = name_finder.load_glove(sys.argv[1]) |
| sentences, labels, char_set = name_finder.load_data(word_dict, sys.argv[2]) |
| sentences_dev, labels_dev, char_set_dev = name_finder.load_data(word_dict, sys.argv[3]) |
| |
    # Character vocabulary over the union of train and dev characters
    char_dict = {k: v for v, k in enumerate(char_set | char_set_dev)}
| |
| embedding_ph, token_ids_ph, char_ids_ph, word_lengths_ph, sequence_lengths_ph, labels_ph, train_op \ |
| = name_finder.create_graph(len(char_set | char_set_dev), embeddings) |
| |
| write_mapping(word_dict, 'word_dict.txt') |
| write_mapping(name_finder.label_dict, "label_dict.txt") |
| write_mapping(name_finder.label_dict, "char_dict.txt") |
| |
| sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, |
| log_device_placement=True)) |
| |
| with sess.as_default(): |
| init = tf.global_variables_initializer() |
| sess.run(init, feed_dict={embedding_ph: embeddings}) |
| |
| batch_size = 20 |
| for epoch in range(100): |
| print("Epoch " + str(epoch)) |
| |
| for batch_index in range(floor(len(sentences) / batch_size)): |
| if batch_index % 200 == 0: |
| print("batch_index " + str(batch_index)) |
| |
| sentences_batch, chars_batch, word_length_batch, labels_batch, lengths = \ |
| name_finder.mini_batch(rev_word_dict, char_dict, sentences, labels, batch_size, batch_index) |
| |
| feed_dict = {token_ids_ph: sentences_batch, char_ids_ph: chars_batch, word_lengths_ph: word_length_batch, sequence_lengths_ph: lengths, |
| labels_ph: labels_batch} |
| |
                train_op.run(feed_dict=feed_dict, session=sess)
| |
| |
| accs = [] |
| correct_preds, total_correct, total_preds = 0., 0., 0. |
| for batch_index in range(floor(len(sentences_dev) / batch_size)): |
| sentences_test_batch, chars_batch_test, word_length_batch_test, \ |
| labels_test_batch, length_test = name_finder.mini_batch(rev_word_dict, |
| char_dict, |
| sentences_dev, |
| labels_dev, |
| batch_size, |
| batch_index) |
| |
| labels_pred, sequence_lengths = name_finder.predict_batch( |
| sess, token_ids_ph, char_ids_ph, word_lengths_ph, sequence_lengths_ph, |
| sentences_test_batch, chars_batch_test, word_length_batch_test, length_test) |
| |
| for lab, lab_pred, length in zip(labels_test_batch, labels_pred, |
| sequence_lengths): |
| lab = lab[:length] |
| lab_pred = lab_pred[:length] |
| accs += [a==b for (a, b) in zip(lab, lab_pred)] |
| |
| lab_chunks = set(get_chunks(lab, name_finder.label_dict)) |
| lab_pred_chunks = set(get_chunks(lab_pred, name_finder.label_dict)) |
| |
| correct_preds += len(lab_chunks & lab_pred_chunks) |
| total_preds += len(lab_pred_chunks) |
| total_correct += len(lab_chunks) |
| |
| p = correct_preds / total_preds if correct_preds > 0 else 0 |
| r = correct_preds / total_correct if correct_preds > 0 else 0 |
| f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 |
| acc = np.mean(accs) |
| |
| print("ACC " + str(acc)) |
| print("F1 " + str(f1) + " P " + str(p) + " R " + str(r)) |
| |
| # TODO: Store the model, load it with java ... |
| |
| if __name__ == "__main__": |
| main() |