blob: dadea54a77dce2021de9a6e9ba1f7977c7d0a858 [file] [log] [blame]
#!/usr/bin/env bash
## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements. See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership. The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License. You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
# The environment for this sub-script is set up by "tdbloader2"
# Resolves a symbolic link to the path it ultimately points at.
#
# Arguments: $1 - path that may be a symbolic link
# Outputs:   the resolved path on stdout; $1 unchanged when it is not a link
#
# On darwin/bsd (per $OSTYPE) links are followed manually with plain
# readlink; everywhere else "readlink -f" is used to canonicalize.
function resolveLink() {
  local NAME=$1
  local TARGET
  local HOPS=0
  if [ -L "$NAME" ]; then
    case "$OSTYPE" in
      darwin*|bsd*)
        # BSD style readlink behaves differently to GNU readlink
        # Have to manually follow links
        while [ -L "$NAME" ]; do
          # Give up on cyclic links instead of looping forever
          HOPS=$((HOPS + 1))
          if [ "$HOPS" -gt 40 ]; then
            echo "Too many levels of symbolic links while resolving $1" >&2
            break
          fi
          TARGET=$(readlink -- "$NAME")
          case "$TARGET" in
            /*)
              NAME=$TARGET
              ;;
            *)
              # A relative target is relative to the directory holding
              # the link, not to the current working directory
              NAME="$(dirname "$NAME")/$TARGET"
              ;;
          esac
        done
        ;;
      *)
        # Assuming standard GNU readlink with -f for
        # canonicalize
        NAME=$(readlink -f -- "$NAME")
        ;;
    esac
  fi
  printf '%s\n' "$NAME"
}
# Locate JENA_HOME and pull in the common functions from
# ${JENA_HOME}/bin/tdbloader2common. Exits with status 1 when JENA_HOME
# is unset/empty or the common functions script cannot be found.
if [ -z "$JENA_HOME" ]; then
  echo "JENA_HOME is not set"
  exit 1
fi

# If JENA_HOME is a symbolic link need to resolve
if [ -L "${JENA_HOME}" ]; then
  # Remember the directory holding the link: a relative result from
  # resolveLink may be relative to it rather than to the working directory
  JENA_HOME_LINK_DIR=$(dirname "$JENA_HOME")
  JENA_HOME=$(resolveLink "$JENA_HOME")
  case "$JENA_HOME" in
    /*)
      # Already absolute
      ;;
    *)
      # Relative, make absolute. Try the link's own directory first and
      # then the current working directory. CDPATH is cleared so that
      # cd cannot wander off to an unrelated directory.
      if [ -d "$JENA_HOME_LINK_DIR/$JENA_HOME" ]; then
        JENA_HOME=$(CDPATH= cd -- "$JENA_HOME_LINK_DIR/$JENA_HOME" && pwd)
      elif [ -d "$JENA_HOME" ]; then
        JENA_HOME=$(CDPATH= cd -- "$JENA_HOME" && pwd)
      else
        echo "Unable to resolve symbolic link for JENA_HOME to a directory: $JENA_HOME"
        exit 1
      fi
      ;;
  esac
  unset JENA_HOME_LINK_DIR
  export JENA_HOME
  echo "Resolved symbolic links for JENA_HOME to $JENA_HOME"
fi

if [ -e "${JENA_HOME}/bin/tdbloader2common" ]; then
  # Can source common functions
  source "${JENA_HOME}/bin/tdbloader2common"
else
  echo "Unable to locate common functions script tdbloader2common"
  exit 1
fi
# Writes the command line help summary for tdbloader2index to stdout.
# Takes no arguments. The text is emitted verbatim (quoted here-document,
# so nothing inside it is subject to shell expansion).
printUsage() {
  cat <<'EOF'
tdbloader2index - TDB Bulk Loader - Index Phase
Usage: tdbloader2index --loc <Directory> [Options]
Bulk Loader for TDB which generates the Index files based upon the
temporary data files generated by tdbloader2data. This command relies
on POSIX utilities so will only work on POSIX operating systems.
This command can only be used to create new database. If you wish to
bulk load to an existing database please use tdbloader instead.
Required options are as follows:
-l <DatabaseDirectory>
--loc <DatabaseDirectory>
Sets the location in which the database should be created.
This location must be a directory and must be empty, if a
non-existent path is specified it will be created as a new
directory.
Common additional options are as follows:
-h
--help
Prints this help summary and exits
Advanced additional options are as follows:
-d
--debug
Enable debug mode, adds extra debug output
-j <JvmArgs>
--jvm-args <JvmArgs>
Sets the arguments that should be passed to the JVM for the
JVM based portions of the build.
Generally it is best to not change these unless you have been
specifically advised to. The scripts will use appropriate
defaults if this is not specified.
In particular be careful increasing the heap size since many
parts of TDB actually use memory mapped files that live
outside the heap so if the heap is too large the heap may
conflict with the memory mapped files for memory space.
-k
--keep-work
Keeps the temporary work files around after they are no longer
needed. May be useful for debugging.
-s <SortArgs>
--sort-args <SortArgs>
Sets the arguments that should be passed to sort for the sort
based portions of the build.
Generally it is best not to change these as the scripts will
use appropriate defaults for your system.
-t
--trace
Enable trace mode, essentially sets -x within the scripts
EOF
}
# Exit on error.
set -e

# Sort order is ASCII
export LC_ALL="C"

# Option defaults. SORT_ARGS may be pre-seeded from the environment.
LOC=
KEEP_WORK=0
DEBUG=0
JVM_ARGS=
SORT_ARGS="${SORT_ARGS:-}"
if [ -n "$SORT_ARGS" ]; then
  echo "Using SORT_ARGS: $SORT_ARGS"
fi

# Parses the command line into LOC, KEEP_WORK, DEBUG, JVM_ARGS and SORT_ARGS.
#
# Arguments: the script's command line arguments
# Globals:   LOC, KEEP_WORK, DEBUG, JVM_ARGS, SORT_ARGS (written)
#
# Calls abort (not defined in this file; expected to come from the sourced
# tdbloader2common) for an unrecognized option or for an option that is
# missing its value. -h/--help prints the usage and exits with status 0.
parseIndexArguments() {
  local ARG
  while [ $# -gt 0 ]; do
    ARG=$1
    case "$ARG" in
      -d|--debug)
        # Debug Mode
        DEBUG=1
        shift
        ;;
      -h|--help)
        printUsage
        exit 0
        ;;
      -j|--jvm-args)
        # JVM Arguments
        # Without this check the trailing shift fails and, under set -e,
        # the script would exit without any message
        if [ $# -lt 2 ]; then
          abort 1 "Option $ARG requires an argument"
        fi
        JVM_ARGS="$2"
        shift 2
        ;;
      -k|--keep-work)
        # Keep work files
        KEEP_WORK=1
        shift
        ;;
      -l|--loc|-loc)
        # Location space separated
        if [ $# -lt 2 ]; then
          abort 1 "Option $ARG requires an argument"
        fi
        LOC="$2"
        shift 2
        ;;
      -*loc=*)
        # Location = separated
        # Strip the shortest prefix ending in "loc=" so that a path which
        # itself contains "loc=" is kept intact
        LOC=${ARG#*loc=}
        shift
        ;;
      -s|--sort-args)
        # Sort arguments
        if [ $# -lt 2 ]; then
          abort 1 "Option $ARG requires an argument"
        fi
        SORT_ARGS=$2
        shift 2
        ;;
      -t|--trace)
        # Trace mode
        shift
        set -x
        ;;
      *)
        # Additional options are not supported
        abort 1 "Unrecognized option $ARG"
        ;;
    esac
  done
}

# Process Arguments
parseIndexArguments "$@"
# Validate the parsed arguments: a database location is mandatory
[ -n "$LOC" ] || abort 1 "Required database location not specified"

# Work with an absolute database location from here on
ABS_LOC=$(makeAbsolute "$LOC")
[ "$ABS_LOC" = "$LOC" ] || {
  LOC="$ABS_LOC"
  debug "Absolute database location is $LOC"
}

# The location has to exist already and has to be a directory
[ -e "$LOC" ] || abort 1 "Database location specified does not exist: $LOC"
[ -d "$LOC" ] || abort 1 "Database location is not a directory: $LOC"

# Both intermediate text files must be present in the location; the
# messages point the user at tdbloader2data when one is missing
DATA_TRIPLES="$LOC/data-triples.tmp"
DATA_QUADS="$LOC/data-quads.tmp"
[ -e "$DATA_TRIPLES" ] || abort 1 "No triples text file found in database location, please run the tdbloader2data script first"
[ -e "$DATA_QUADS" ] || abort 1 "No quads text file found in database location, please run the tdbloader2data script first"
debug "Data text files are $DATA_TRIPLES and $DATA_QUADS"
# Default the sort arguments when the user supplied none
if [ -z "$SORT_ARGS" ]; then
  SORT_ARGS="--buffer-size=50%"
  case "$SORT_ARGS" in
    *"--parallel="*)
      # Parallelism already specified, nothing to probe
      ;;
    *)
      # --parallel is not always available, so probe for it with an empty
      # input. Exit on error is switched off around the probe and back on
      # afterwards.
      set +e
      if sort --parallel=3 < /dev/null 2>/dev/null; then
        SORT_ARGS+=" --parallel=3"
      fi
      set -e
      ;;
  esac
fi
# Fall back to the default heap setting when no JVM arguments were given
if [ -z "$JVM_ARGS" ]; then
  JVM_ARGS="-Xmx1200M"
fi
debug "JVM Arguments are $JVM_ARGS"

# The classpath is set in "tdbloader2" and handed over via JENA_CP
[ -n "$JENA_CP" ] || abort 1 "Classpath not provided : set JENA_CP"
debug "Jena Classpath is $JENA_CP"
# ---- Index intermediates
# All files are written S P O / G S P O columns per row but in different sort orders.
info "Index Building Phase"

# Works out which directory sort will use for its temporary files.
#
# Arguments: $1 - the sort argument string
# Outputs:   the word following "-T " or "--temporary-directory=" wherever it
#            appears in $1; otherwise $TMPDIR, or /tmp when TMPDIR is
#            unset/empty
extractSortTempDir() {
  local ARGS=$1
  local DIR=
  case "$ARGS" in
    *"-T "*)
      # Specified via -T argument: drop everything up to and including the
      # option, then keep the first remaining word
      read -r DIR _ <<< "${ARGS#*-T }"
      ;;
    *"--temporary-directory="*)
      # Specified via --temporary-directory argument
      read -r DIR _ <<< "${ARGS#*--temporary-directory=}"
      ;;
    *)
      # Using the system temp directory
      DIR="${TMPDIR:-/tmp}"
      ;;
  esac
  printf '%s\n' "$DIR"
}

# Check where we are storing temporary sort files
debug "Sort Arguments: $SORT_ARGS"
SORT_TEMP_DIR=$(extractSortTempDir "$SORT_ARGS")
if [ -n "$SORT_TEMP_DIR" ]; then
  # If we've figured out the sort temporary directory then check it
  SORT_TEMP_DIR=$(makeAbsolute "$SORT_TEMP_DIR")
  debug "Sort Temp Directory: $SORT_TEMP_DIR"
  # getDriveInfo is not defined in this file; going by the messages below,
  # element 0 names the disk, element 2 is the free space percentage and
  # element 3 the free bytes
  SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
  if [ "${#SORT_DRIVE_INFO[@]}" -gt 0 ]; then
    debug "Sort Temp Directory is on disk ${SORT_DRIVE_INFO[0]} which has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes)"
    if [ "${SORT_DRIVE_INFO[2]}" -le 10 ]; then
      warn "-----"
      warn "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_INFO[0]} which only has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes) available"
      warn "This may result in sort failures if the data to be indexed is large"
      warn "-----"
    fi
  fi
fi
# Sorts a data text file into the given key order and builds one index
# from the sorted output.
#
# Arguments: $1 - sort key options (deliberately word split when passed to
#                 sort)
#            $2 - data text file to index
#            $3 - index name, e.g. SPO; used for the work file and the
#                 .dat/.idn file names
# Globals:   LOC, SORT_ARGS, SORT_TEMP_DIR, JVM_ARGS, JENA_CP, PKG,
#            KEEP_WORK (read). PKG is not set in this file.
# Returns:   0 straight away when the data file is missing or empty
#
# info, debug, warn, abort, getSize, getDriveInfo and getFreeMem are not
# defined in this file; they are expected to come from tdbloader2common.
generate_index()
{
  local KEYS="$1"
  local DATA="$2"
  local IDX=$3
  local WORK="$LOC/$IDX-txt"
  local SIZE FREE_MEM
  local -a WORK_DRIVE_INFO SORT_DRIVE_INFO

  if [ ! -s "$DATA" ]; then
    debug "Skipping Index $IDX as no relevant data to index"
    return 0
  fi

  info "Creating Index $IDX"

  # For various purposes we need to know the size of the input data.
  # The space checks below are advisory, so a failure to determine the
  # size must not stop the build.
  SIZE=$(getSize "$DATA") || true
  debug "Size of data to be sorted is $SIZE bytes"

  # Verify that we have enough space to sort the data
  # Firstly check that the output disk has sufficient space
  WORK_DRIVE_INFO=($(getDriveInfo "$LOC")) || true
  if [ "${#WORK_DRIVE_INFO[@]}" -gt 0 ]; then
    if [ "${SIZE}" -ge "${WORK_DRIVE_INFO[3]}" ]; then
      # If there is insufficient disk space then we can abort now
      abort 1 "Insufficient free space on database drive ${WORK_DRIVE_INFO[0]}, there are ${WORK_DRIVE_INFO[3]} bytes free but ${SIZE} bytes are required"
    else
      debug "Sufficient free space on database drive ${WORK_DRIVE_INFO[0]} to attempt sorting data file ${DATA} (${SIZE} bytes required from ${WORK_DRIVE_INFO[3]} bytes free)"
    fi
  fi

  # Secondly check if there is enough space to sort in-memory or if sort may need to do an external sort
  # We only issue warnings when the sort is likely to be external because there are various factors
  # such as virtual memory and OS file caching that may complicate this
  FREE_MEM=$(getFreeMem)
  if [ "$FREE_MEM" -ge 0 ]; then
    if [ "$SIZE" -ge "$FREE_MEM" ]; then
      debug "Insufficient free memory to sort data in-memory, sort will need to perform an external sort using Temp Directory ${SORT_TEMP_DIR}"

      # Check for disk space on temporary disk
      if [ -n "${SORT_TEMP_DIR}" ]; then
        SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
        if [ "${#SORT_DRIVE_INFO[@]}" -gt 0 ]; then
          if [ "$SIZE" -ge "${SORT_DRIVE_INFO[3]}" ]; then
            warn "There may be insufficient space for sort to perform an external sort using Temp Directory ${SORT_TEMP_DIR} (${SIZE} bytes required but only ${SORT_DRIVE_INFO[3]} bytes free)"
          fi
        fi
      fi
    else
      debug "Should be sufficient free memory ($FREE_MEM bytes) for sort to be fully in-memory"
    fi
  else
    debug "Unable to determine free memory on your OS, can't check whether sort will be in-memory or external sort using Temp Directory ${SORT_TEMP_DIR}"
  fi

  # Sort the input data
  info "Sort $IDX"
  debug "Sorting $DATA into work file $WORK"
  # SORT_ARGS and KEYS each hold several options and are split on purpose;
  # the work file is quoted so a location containing spaces still works
  # shellcheck disable=SC2086
  sort $SORT_ARGS -u $KEYS < "$DATA" > "$WORK"
  info "Sort $IDX Completed"

  # Build into an index
  info "Build $IDX"
  rm -f "$LOC/$IDX.dat"
  rm -f "$LOC/$IDX.idn"
  # JVM_ARGS may hold several options and is split on purpose
  # shellcheck disable=SC2086
  java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
  info "Build $IDX Completed"

  # Remove work file unless keeping
  if [ "$KEEP_WORK" = 0 ]; then
    debug "Cleaning up work file $WORK"
    rm "$WORK"
  fi
}
# Sort key options, one per column of the data text files
K1="-k 1,1"
K2="-k 2,2"
K3="-k 3,3"
K4="-k 4,4"

# Each entry is "<index name>|<sort keys>". The triples indexes are built
# from the triples data file first, then the quads indexes from the quads
# data file.
for ENTRY in \
  "SPO|$K1 $K2 $K3" \
  "POS|$K2 $K3 $K1" \
  "OSP|$K3 $K1 $K2"; do
  generate_index "${ENTRY#*|}" "$DATA_TRIPLES" "${ENTRY%%|*}"
done

for ENTRY in \
  "GSPO|$K1 $K2 $K3 $K4" \
  "GPOS|$K1 $K3 $K4 $K2" \
  "GOSP|$K1 $K4 $K2 $K3" \
  "SPOG|$K2 $K3 $K4 $K1" \
  "POSG|$K3 $K4 $K2 $K1" \
  "OSPG|$K4 $K2 $K3 $K1"; do
  generate_index "${ENTRY#*|}" "$DATA_QUADS" "${ENTRY%%|*}"
done

info "Index Building Phase Completed"

# ---- Clean up.
# The intermediate data text files are removed unless work files are kept
if [ "$KEEP_WORK" = 0 ]; then
  debug "Cleaning up data files $DATA_TRIPLES and $DATA_QUADS"
  rm -f "$DATA_TRIPLES" "$DATA_QUADS"
fi