blob: abbcc5fac84408f85c70a5123422d1a17e183b6b [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Preprocessing script for SICK data.
"""
import glob
import os
import subprocess
def make_dirs(dirs):
    """Create every directory in ``dirs``, including missing parents.

    Directories that already exist are left untouched.

    Args:
        dirs: Iterable of directory paths to create.

    Raises:
        OSError: If a path cannot be created, e.g. because it already
            exists but is not a directory.
    """
    for d in dirs:
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists() before calling os.makedirs().
        os.makedirs(d, exist_ok=True)
def dependency_parse(filepath, cp='', tokenize=True):
    """Run the Java ``DependencyParse`` program on a sentence file.

    Output paths are derived from ``filepath`` by replacing its extension:
    ``.toks``, ``.parents`` and ``.rels`` files in the same directory are
    passed to the parser as ``-tokpath``, ``-parentpath`` and ``-relpath``.
    The contents of ``filepath`` are fed to the parser on standard input.

    Args:
        filepath: Path of the input file.
        cp: Java classpath passed via ``-cp``. When empty, ``-cp`` is
            omitted instead of being given an empty value.
        tokenize: When true, ``-tokenize -`` is appended to the arguments.

    Returns:
        The exit status of the ``java`` process.

    Raises:
        OSError: If ``filepath`` cannot be opened or ``java`` cannot be
            started.
    """
    print('\nDependency parsing ' + filepath)
    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    tokpath = os.path.join(dirpath, filepre + '.toks')
    parentpath = os.path.join(dirpath, filepre + '.parents')
    relpath = os.path.join(dirpath, filepre + '.rels')

    # Build an argument list rather than a shell string so that paths
    # containing spaces or shell metacharacters are passed through intact.
    cmd = ['java']
    if cp:
        cmd += ['-cp', cp]
    cmd += ['DependencyParse',
            '-tokpath', tokpath,
            '-parentpath', parentpath,
            '-relpath', relpath]
    if tokenize:
        cmd += ['-tokenize', '-']

    # Equivalent of the shell redirection "< filepath".
    with open(filepath, 'rb') as infile:
        return subprocess.call(cmd, stdin=infile)
def constituency_parse(filepath, cp='', tokenize=True):
    """Run the Java ``ConstituencyParse`` program on a sentence file.

    Output paths are derived from ``filepath`` by replacing its extension:
    ``.toks`` and ``.cparents`` files in the same directory are passed to
    the parser as ``-tokpath`` and ``-parentpath``. The contents of
    ``filepath`` are fed to the parser on standard input.

    Args:
        filepath: Path of the input file.
        cp: Java classpath passed via ``-cp``. When empty, ``-cp`` is
            omitted instead of being given an empty value.
        tokenize: When true, ``-tokenize -`` is appended to the arguments.

    Returns:
        The exit status of the ``java`` process.

    Raises:
        OSError: If ``filepath`` cannot be opened or ``java`` cannot be
            started.
    """
    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    tokpath = os.path.join(dirpath, filepre + '.toks')
    parentpath = os.path.join(dirpath, filepre + '.cparents')

    # Build an argument list rather than a shell string so that paths
    # containing spaces or shell metacharacters are passed through intact.
    cmd = ['java']
    if cp:
        cmd += ['-cp', cp]
    cmd += ['ConstituencyParse',
            '-tokpath', tokpath,
            '-parentpath', parentpath]
    if tokenize:
        cmd += ['-tokenize', '-']

    # Equivalent of the shell redirection "< filepath".
    with open(filepath, 'rb') as infile:
        return subprocess.call(cmd, stdin=infile)
def build_vocab(filepaths, dst_path, lowercase=True):
    """Write the sorted set of whitespace-separated tokens to ``dst_path``.

    Args:
        filepaths: Iterable of paths to text files to scan.
        dst_path: Output file; receives one token per line in sorted order.
        lowercase: When true, each line is lowercased before it is split.
    """
    tokens = set()
    for path in filepaths:
        with open(path) as src:
            for text in src:
                words = text.lower().split() if lowercase else text.split()
                tokens.update(words)

    with open(dst_path, 'w') as out:
        out.writelines(tok + '\n' for tok in sorted(tokens))
def split(filepath, dst_dir):
    """Split a tab-separated data file into one output file per column.

    The first line of ``filepath`` is discarded (presumably the column
    header). Every following non-blank line must contain exactly five
    tab-separated fields; the first four are written, one value per line,
    to ``id.txt``, ``a.txt``, ``b.txt`` and ``sim.txt`` inside ``dst_dir``.
    The fifth field is read but not written anywhere.

    Args:
        filepath: Path of the tab-separated input file.
        dst_dir: Existing directory that receives the four output files.

    Raises:
        ValueError: If a non-blank data line does not have exactly five
            tab-separated fields.
    """
    with open(filepath) as datafile, \
            open(os.path.join(dst_dir, 'a.txt'), 'w') as afile, \
            open(os.path.join(dst_dir, 'b.txt'), 'w') as bfile, \
            open(os.path.join(dst_dir, 'id.txt'), 'w') as idfile, \
            open(os.path.join(dst_dir, 'sim.txt'), 'w') as simfile:
        datafile.readline()  # discard the first line
        for line in datafile:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record;
                # unpacking them would raise ValueError.
                continue
            pair_id, sent_a, sent_b, sim, _ent = line.split('\t')
            idfile.write(pair_id + '\n')
            afile.write(sent_a + '\n')
            bfile.write(sent_b + '\n')
            simfile.write(sim + '\n')
def parse(dirpath, cp=''):
    """Run both parsers over ``a.txt`` and ``b.txt`` inside ``dirpath``.

    The dependency parser is run on both files first, then the
    constituency parser; tokenization is enabled for every call.

    Args:
        dirpath: Directory containing ``a.txt`` and ``b.txt``.
        cp: Java classpath forwarded to each parser call.
    """
    sentence_files = [os.path.join(dirpath, name)
                      for name in ('a.txt', 'b.txt')]
    for run_parser in (dependency_parse, constituency_parse):
        for path in sentence_files:
            run_parser(path, cp=cp, tokenize=True)
if __name__ == '__main__':
    print('=' * 80)
    print('Preprocessing SICK dataset')
    print('=' * 80)

    # All paths are resolved relative to the parent of the directory that
    # contains this script.
    base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    data_dir = os.path.join(base_dir, 'data')
    sick_dir = os.path.join(data_dir, 'sick')
    lib_dir = os.path.join(base_dir, 'lib')
    train_dir = os.path.join(sick_dir, 'train')
    dev_dir = os.path.join(sick_dir, 'dev')
    test_dir = os.path.join(sick_dir, 'test')
    make_dirs([train_dir, dev_dir, test_dir])

    # Java classpath for calling the Stanford parser. The entries are joined
    # with os.pathsep because the classpath separator is platform-specific
    # (':' on POSIX, ';' on Windows).
    classpath = os.pathsep.join([
        lib_dir,
        os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
        os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar')])

    # split into separate files
    split(os.path.join(sick_dir, 'SICK_train.txt'), train_dir)
    split(os.path.join(sick_dir, 'SICK_trial.txt'), dev_dir)
    split(os.path.join(sick_dir, 'SICK_test_annotated.txt'), test_dir)

    # parse sentences
    parse(train_dir, cp=classpath)
    parse(dev_dir, cp=classpath)
    parse(test_dir, cp=classpath)

    # get vocabulary: the same token files feed both the lowercased and the
    # cased vocabulary, so the glob is evaluated once.
    toks_files = glob.glob(os.path.join(sick_dir, '*/*.toks'))
    build_vocab(toks_files, os.path.join(sick_dir, 'vocab.txt'))
    build_vocab(
        toks_files,
        os.path.join(sick_dir, 'vocab-cased.txt'),
        lowercase=False)