example/named_entity_recognition/src/preprocess.py - mxnet - Git at Google

 # !/usr/bin/env python

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # -*- coding: utf-8 -*-

 import pandas as pd
 import numpy as np

 # Preprocess the NER dataset: read the token-per-row CSV, normalise the column
 # names and utterance ids, collapse every utterance into per-column arrays and
 # pickle the result.

 # read in csv of NER training data
 df = pd.read_csv("../data/ner_dataset.csv", encoding="ISO-8859-1")

 # rename columns
 df = df.rename(columns={
     "Sentence #": "utterance_id",
     "Word": "token",
     "POS": "POS_tag",
     "Tag": "BILOU_tag",
 })

 # clean utterance_id column: drop the literal "Sentence: " prefix.
 # regex=False makes the literal match explicit, since the default value of
 # `regex` differs between pandas versions.
 df["utterance_id"] = df["utterance_id"].str.replace("Sentence: ", "", regex=False)

 # fill np.nan utterance ID's with the last valid entry
 # NOTE(review): the forward fill is applied to every column, so a missing
 # value in any other column is also replaced by the previous row's value -
 # confirm this is intended.
 df = df.ffill()

 # Assign the converted column directly (not through df.loc[:, ...]) so that
 # the column really becomes an integer column; .loc assignment writes into the
 # existing column in place on recent pandas and can keep the old dtype.
 df["utterance_id"] = df["utterance_id"].astype("int64")

 # melt BILOU tags, tokens and POS tags into an array per utterance
 grouped = df.groupby("utterance_id")
 df1 = grouped["BILOU_tag"].apply(lambda x: np.array(x)).to_frame().reset_index()
 df2 = grouped["token"].apply(lambda x: np.array(x)).to_frame().reset_index()
 df3 = grouped["POS_tag"].apply(lambda x: np.array(x)).to_frame().reset_index()

 # join the results on utterance id
 df = df1.merge(df2.merge(df3, how="left", on="utterance_id"), how="left", on="utterance_id")

 # save the dataframe to a pickle file
 df.to_pickle("../data/ner_data.pkl")
	# !/usr/bin/env python

	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	# -*- coding: utf-8 -*-

	import pandas as pd
	import numpy as np

	# Preprocess the NER dataset: read the token-per-row CSV, normalise the column
	# names and utterance ids, collapse every utterance into per-column arrays and
	# pickle the result.

	# read in csv of NER training data
	df = pd.read_csv("../data/ner_dataset.csv", encoding="ISO-8859-1")

	# rename columns
	df = df.rename(columns={
	    "Sentence #": "utterance_id",
	    "Word": "token",
	    "POS": "POS_tag",
	    "Tag": "BILOU_tag",
	})

	# clean utterance_id column: drop the literal "Sentence: " prefix.
	# regex=False makes the literal match explicit, since the default value of
	# `regex` differs between pandas versions.
	df["utterance_id"] = df["utterance_id"].str.replace("Sentence: ", "", regex=False)

	# fill np.nan utterance ID's with the last valid entry
	# NOTE(review): the forward fill is applied to every column, so a missing
	# value in any other column is also replaced by the previous row's value -
	# confirm this is intended.
	df = df.ffill()

	# Assign the converted column directly (not through df.loc[:, ...]) so that
	# the column really becomes an integer column; .loc assignment writes into the
	# existing column in place on recent pandas and can keep the old dtype.
	df["utterance_id"] = df["utterance_id"].astype("int64")

	# melt BILOU tags, tokens and POS tags into an array per utterance
	grouped = df.groupby("utterance_id")
	df1 = grouped["BILOU_tag"].apply(lambda x: np.array(x)).to_frame().reset_index()
	df2 = grouped["token"].apply(lambda x: np.array(x)).to_frame().reset_index()
	df3 = grouped["POS_tag"].apply(lambda x: np.array(x)).to_frame().reset_index()

	# join the results on utterance id
	df = df1.merge(df2.merge(df3, how="left", on="utterance_id"), how="left", on="utterance_id")

	# save the dataframe to a pickle file
	df.to_pickle("../data/ner_data.pkl")