example/gluon/tree_lstm/scripts/preprocess-sick.py - mxnet - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """
 Preprocessing script for SICK data.

 """

 import os
 import glob

 def make_dirs(dirs):
     """Create every directory in ``dirs``, including missing parents.

     Directories that already exist are left untouched.

     Args:
         dirs: iterable of directory paths to create.

     Raises:
         OSError: if a path cannot be created, or if it already exists
             but is not a directory.
     """
     for d in dirs:
         try:
             os.makedirs(d)
         except OSError:
             # Create first and inspect afterwards: this avoids the
             # check-then-create race of calling os.path.exists() up front,
             # and surfaces a non-directory sitting at the target path.
             if not os.path.isdir(d):
                 raise

 def dependency_parse(filepath, cp='', tokenize=True):
     """Run the Java ``DependencyParse`` program on a sentence file.

     The contents of ``filepath`` are fed to the program on standard input.
     Output locations are passed as ``-tokpath``, ``-parentpath`` and
     ``-relpath``; they sit next to ``filepath`` and share its base name,
     with the extensions ``.toks``, ``.parents`` and ``.rels``.

     Args:
         filepath: path of the input text file.
         cp: Java classpath handed to ``java -cp``.
         tokenize: when true, ``-tokenize -`` is appended to the command.

     Raises:
         IOError/OSError: if ``filepath`` cannot be opened or the ``java``
             executable cannot be started.
     """
     import subprocess

     print('\nDependency parsing ' + filepath)
     dirpath = os.path.dirname(filepath)
     filepre = os.path.splitext(os.path.basename(filepath))[0]
     # An argument list (no shell) keeps paths containing spaces or shell
     # metacharacters intact, which the interpolated command string did not.
     cmd = [
         'java', '-cp', cp, 'DependencyParse',
         '-tokpath', os.path.join(dirpath, filepre + '.toks'),
         '-parentpath', os.path.join(dirpath, filepre + '.parents'),
         '-relpath', os.path.join(dirpath, filepre + '.rels'),
     ]
     if tokenize:
         cmd.extend(['-tokenize', '-'])
     with open(filepath, 'rb') as sentences:
         status = subprocess.call(cmd, stdin=sentences)
     if status != 0:
         # Keep going as before, but do not hide a failed parser run.
         print('Warning: DependencyParse exited with status %d for %s'
               % (status, filepath))

 def constituency_parse(filepath, cp='', tokenize=True):
     """Run the Java ``ConstituencyParse`` program on a sentence file.

     The contents of ``filepath`` are fed to the program on standard input.
     Output locations are passed as ``-tokpath`` and ``-parentpath``; they
     sit next to ``filepath`` and share its base name, with the extensions
     ``.toks`` and ``.cparents``.

     Args:
         filepath: path of the input text file.
         cp: Java classpath handed to ``java -cp``.
         tokenize: when true, ``-tokenize -`` is appended to the command.

     Raises:
         IOError/OSError: if ``filepath`` cannot be opened or the ``java``
             executable cannot be started.
     """
     import subprocess

     dirpath = os.path.dirname(filepath)
     filepre = os.path.splitext(os.path.basename(filepath))[0]
     # An argument list (no shell) keeps paths containing spaces or shell
     # metacharacters intact, which the interpolated command string did not.
     cmd = [
         'java', '-cp', cp, 'ConstituencyParse',
         '-tokpath', os.path.join(dirpath, filepre + '.toks'),
         '-parentpath', os.path.join(dirpath, filepre + '.cparents'),
     ]
     if tokenize:
         cmd.extend(['-tokenize', '-'])
     with open(filepath, 'rb') as sentences:
         status = subprocess.call(cmd, stdin=sentences)
     if status != 0:
         # Keep going as before, but do not hide a failed parser run.
         print('Warning: ConstituencyParse exited with status %d for %s'
               % (status, filepath))

 def build_vocab(filepaths, dst_path, lowercase=True):
     """Write the sorted set of whitespace-separated tokens found in files.

     Args:
         filepaths: iterable of text files to scan.
         dst_path: output file; receives one token per line, sorted.
         lowercase: when true, each line is lower-cased before splitting.
     """
     words = set()
     for path in filepaths:
         with open(path) as src:
             for raw in src:
                 text = raw.lower() if lowercase else raw
                 words.update(text.split())
     with open(dst_path, 'w') as out:
         out.writelines(token + '\n' for token in sorted(words))

 def split(filepath, dst_dir):
     """Split a tab-separated data file into one file per column.

     The first line of ``filepath`` is read and discarded. Every following
     non-blank line must hold exactly five tab-separated fields; the first
     four are written, one value per line, to ``id.txt``, ``a.txt``,
     ``b.txt`` and ``sim.txt`` inside ``dst_dir``. The fifth field is
     ignored.

     Args:
         filepath: path of the tab-separated input file.
         dst_dir: existing directory that receives the four output files.

     Raises:
         ValueError: if a non-blank data line does not have five fields.
     """
     with open(filepath) as datafile, \
             open(os.path.join(dst_dir, 'a.txt'), 'w') as afile, \
             open(os.path.join(dst_dir, 'b.txt'), 'w') as bfile, \
             open(os.path.join(dst_dir, 'id.txt'), 'w') as idfile, \
             open(os.path.join(dst_dir, 'sim.txt'), 'w') as simfile:
         datafile.readline()  # discard the first (header) line
         # Data starts on line 2 of the file; track it for error messages.
         for lineno, line in enumerate(datafile, 2):
             line = line.strip()
             if not line:
                 # A blank line (e.g. a trailing newline at EOF) is not a
                 # record; skip it instead of failing on the unpack below.
                 continue
             fields = line.split('\t')
             if len(fields) != 5:
                 raise ValueError(
                     '%s:%d: expected 5 tab-separated fields, got %d'
                     % (filepath, lineno, len(fields)))
             pair_id, sent_a, sent_b, score, _ = fields
             idfile.write(pair_id + '\n')
             afile.write(sent_a + '\n')
             bfile.write(sent_b + '\n')
             simfile.write(score + '\n')

 def parse(dirpath, cp=''):
     """Dependency-parse, then constituency-parse, ``a.txt`` and ``b.txt``.

     Args:
         dirpath: directory containing ``a.txt`` and ``b.txt``.
         cp: Java classpath forwarded to both parser wrappers.
     """
     sentence_files = [os.path.join(dirpath, name)
                       for name in ('a.txt', 'b.txt')]
     # Outer loop over parsers keeps the original call order:
     # dependency(a), dependency(b), constituency(a), constituency(b).
     for run_parser in (dependency_parse, constituency_parse):
         for path in sentence_files:
             run_parser(path, cp=cp, tokenize=True)

 if __name__ == '__main__':
     print('=' * 80)
     print('Preprocessing SICK dataset')
     print('=' * 80)

     # All paths are resolved relative to the parent of this script's folder.
     base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
     data_dir = os.path.join(base_dir, 'data')
     sick_dir = os.path.join(data_dir, 'sick')
     lib_dir = os.path.join(base_dir, 'lib')
     train_dir = os.path.join(sick_dir, 'train')
     dev_dir = os.path.join(sick_dir, 'dev')
     test_dir = os.path.join(sick_dir, 'test')
     make_dirs([train_dir, dev_dir, test_dir])

     # java classpath for calling Stanford parser; os.pathsep is the
     # platform's path-list separator (':' on POSIX, ';' on Windows).
     classpath = os.pathsep.join([
         lib_dir,
         os.path.join(lib_dir, 'stanford-parser', 'stanford-parser.jar'),
         os.path.join(lib_dir, 'stanford-parser',
                      'stanford-parser-3.5.1-models.jar')])

     # split into separate files
     split(os.path.join(sick_dir, 'SICK_train.txt'), train_dir)
     split(os.path.join(sick_dir, 'SICK_trial.txt'), dev_dir)
     split(os.path.join(sick_dir, 'SICK_test_annotated.txt'), test_dir)

     # parse sentences
     parse(train_dir, cp=classpath)
     parse(dev_dir, cp=classpath)
     parse(test_dir, cp=classpath)

     # get vocabulary; both vocab files are built from the same token
     # files, so the glob is evaluated once.
     tok_files = glob.glob(os.path.join(sick_dir, '*/*.toks'))
     build_vocab(tok_files, os.path.join(sick_dir, 'vocab.txt'))
     build_vocab(
         tok_files,
         os.path.join(sick_dir, 'vocab-cased.txt'),
         lowercase=False)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	"""
	Preprocessing script for SICK data.

	"""

	import os
	import glob

	def make_dirs(dirs):
	    """Create every directory in ``dirs``, including missing parents.

	    Directories that already exist are left untouched.

	    Args:
	        dirs: iterable of directory paths to create.

	    Raises:
	        OSError: if a path cannot be created, or if it already exists
	            but is not a directory.
	    """
	    for d in dirs:
	        try:
	            os.makedirs(d)
	        except OSError:
	            # Create first and inspect afterwards: this avoids the
	            # check-then-create race of calling os.path.exists() up front,
	            # and surfaces a non-directory sitting at the target path.
	            if not os.path.isdir(d):
	                raise

	def dependency_parse(filepath, cp='', tokenize=True):
	    """Run the Java ``DependencyParse`` program on a sentence file.

	    The contents of ``filepath`` are fed to the program on standard input.
	    Output locations are passed as ``-tokpath``, ``-parentpath`` and
	    ``-relpath``; they sit next to ``filepath`` and share its base name,
	    with the extensions ``.toks``, ``.parents`` and ``.rels``.

	    Args:
	        filepath: path of the input text file.
	        cp: Java classpath handed to ``java -cp``.
	        tokenize: when true, ``-tokenize -`` is appended to the command.

	    Raises:
	        IOError/OSError: if ``filepath`` cannot be opened or the ``java``
	            executable cannot be started.
	    """
	    import subprocess

	    print('\nDependency parsing ' + filepath)
	    dirpath = os.path.dirname(filepath)
	    filepre = os.path.splitext(os.path.basename(filepath))[0]
	    # An argument list (no shell) keeps paths containing spaces or shell
	    # metacharacters intact, which an interpolated command string does not.
	    cmd = [
	        'java', '-cp', cp, 'DependencyParse',
	        '-tokpath', os.path.join(dirpath, filepre + '.toks'),
	        '-parentpath', os.path.join(dirpath, filepre + '.parents'),
	        '-relpath', os.path.join(dirpath, filepre + '.rels'),
	    ]
	    if tokenize:
	        cmd.extend(['-tokenize', '-'])
	    with open(filepath, 'rb') as sentences:
	        status = subprocess.call(cmd, stdin=sentences)
	    if status != 0:
	        # Keep going, but do not hide a failed parser run.
	        print('Warning: DependencyParse exited with status %d for %s'
	              % (status, filepath))

	def constituency_parse(filepath, cp='', tokenize=True):
	    """Run the Java ``ConstituencyParse`` program on a sentence file.

	    The contents of ``filepath`` are fed to the program on standard input.
	    Output locations are passed as ``-tokpath`` and ``-parentpath``; they
	    sit next to ``filepath`` and share its base name, with the extensions
	    ``.toks`` and ``.cparents``.

	    Args:
	        filepath: path of the input text file.
	        cp: Java classpath handed to ``java -cp``.
	        tokenize: when true, ``-tokenize -`` is appended to the command.

	    Raises:
	        IOError/OSError: if ``filepath`` cannot be opened or the ``java``
	            executable cannot be started.
	    """
	    import subprocess

	    dirpath = os.path.dirname(filepath)
	    filepre = os.path.splitext(os.path.basename(filepath))[0]
	    # An argument list (no shell) keeps paths containing spaces or shell
	    # metacharacters intact, which an interpolated command string does not.
	    cmd = [
	        'java', '-cp', cp, 'ConstituencyParse',
	        '-tokpath', os.path.join(dirpath, filepre + '.toks'),
	        '-parentpath', os.path.join(dirpath, filepre + '.cparents'),
	    ]
	    if tokenize:
	        cmd.extend(['-tokenize', '-'])
	    with open(filepath, 'rb') as sentences:
	        status = subprocess.call(cmd, stdin=sentences)
	    if status != 0:
	        # Keep going, but do not hide a failed parser run.
	        print('Warning: ConstituencyParse exited with status %d for %s'
	              % (status, filepath))

	def build_vocab(filepaths, dst_path, lowercase=True):
	    """Write the sorted set of whitespace-separated tokens found in files.

	    Args:
	        filepaths: iterable of text files to scan.
	        dst_path: output file; receives one token per line, sorted.
	        lowercase: when true, each line is lower-cased before splitting.
	    """
	    vocab = set()
	    for filepath in filepaths:
	        with open(filepath) as f:
	            for line in f:
	                if lowercase:
	                    line = line.lower()
	                # In-place set union (the stray backslash before the
	                # operator was not valid Python).
	                vocab |= set(line.split())
	    with open(dst_path, 'w') as f:
	        for w in sorted(vocab):
	            f.write(w + '\n')

	def split(filepath, dst_dir):
	    """Split a tab-separated data file into one file per column.

	    The first line of ``filepath`` is read and discarded. Every following
	    non-blank line must hold exactly five tab-separated fields; the first
	    four are written, one value per line, to ``id.txt``, ``a.txt``,
	    ``b.txt`` and ``sim.txt`` inside ``dst_dir``. The fifth field is
	    ignored.

	    Args:
	        filepath: path of the tab-separated input file.
	        dst_dir: existing directory that receives the four output files.

	    Raises:
	        ValueError: if a non-blank data line does not have five fields.
	    """
	    with open(filepath) as datafile, \
	            open(os.path.join(dst_dir, 'a.txt'), 'w') as afile, \
	            open(os.path.join(dst_dir, 'b.txt'), 'w') as bfile, \
	            open(os.path.join(dst_dir, 'id.txt'), 'w') as idfile, \
	            open(os.path.join(dst_dir, 'sim.txt'), 'w') as simfile:
	        datafile.readline()  # discard the first (header) line
	        # Data starts on line 2 of the file; track it for error messages.
	        for lineno, line in enumerate(datafile, 2):
	            line = line.strip()
	            if not line:
	                # A blank line (e.g. a trailing newline at EOF) is not a
	                # record; skip it instead of failing on the unpack below.
	                continue
	            fields = line.split('\t')
	            if len(fields) != 5:
	                raise ValueError(
	                    '%s:%d: expected 5 tab-separated fields, got %d'
	                    % (filepath, lineno, len(fields)))
	            pair_id, sent_a, sent_b, score, _ = fields
	            idfile.write(pair_id + '\n')
	            afile.write(sent_a + '\n')
	            bfile.write(sent_b + '\n')
	            simfile.write(score + '\n')

	def parse(dirpath, cp=''):
	    """Dependency-parse, then constituency-parse, ``a.txt`` and ``b.txt``.

	    Args:
	        dirpath: directory containing ``a.txt`` and ``b.txt``.
	        cp: Java classpath forwarded to both parser wrappers.
	    """
	    dependency_parse(os.path.join(dirpath, 'a.txt'), cp=cp, tokenize=True)
	    dependency_parse(os.path.join(dirpath, 'b.txt'), cp=cp, tokenize=True)
	    constituency_parse(os.path.join(dirpath, 'a.txt'), cp=cp, tokenize=True)
	    constituency_parse(os.path.join(dirpath, 'b.txt'), cp=cp, tokenize=True)

	if __name__ == '__main__':
	    print('=' * 80)
	    print('Preprocessing SICK dataset')
	    print('=' * 80)

	    # All paths are resolved relative to the parent of this script's folder.
	    base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
	    data_dir = os.path.join(base_dir, 'data')
	    sick_dir = os.path.join(data_dir, 'sick')
	    lib_dir = os.path.join(base_dir, 'lib')
	    train_dir = os.path.join(sick_dir, 'train')
	    dev_dir = os.path.join(sick_dir, 'dev')
	    test_dir = os.path.join(sick_dir, 'test')
	    make_dirs([train_dir, dev_dir, test_dir])

	    # java classpath for calling Stanford parser; os.pathsep is the
	    # platform's path-list separator (':' on POSIX, ';' on Windows).
	    classpath = os.pathsep.join([
	        lib_dir,
	        os.path.join(lib_dir, 'stanford-parser', 'stanford-parser.jar'),
	        os.path.join(lib_dir, 'stanford-parser',
	                     'stanford-parser-3.5.1-models.jar')])

	    # split into separate files
	    split(os.path.join(sick_dir, 'SICK_train.txt'), train_dir)
	    split(os.path.join(sick_dir, 'SICK_trial.txt'), dev_dir)
	    split(os.path.join(sick_dir, 'SICK_test_annotated.txt'), test_dir)

	    # parse sentences
	    parse(train_dir, cp=classpath)
	    parse(dev_dir, cp=classpath)
	    parse(test_dir, cp=classpath)

	    # get vocabulary; the parsers are pointed at <split dir>/<name>.toks,
	    # so the pattern must descend one directory level below sick_dir.
	    # (A pattern starting with '/' would make os.path.join drop sick_dir.)
	    tok_files = glob.glob(os.path.join(sick_dir, '*/*.toks'))
	    build_vocab(tok_files, os.path.join(sick_dir, 'vocab.txt'))
	    build_vocab(
	        tok_files,
	        os.path.join(sick_dir, 'vocab-cased.txt'),
	        lowercase=False)