#!/usr/bin/env bash
## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements. See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership. The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License. You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
## External environment variables.
## JENA_CP
## JENA_HOME
## JVM_ARGS
## Programs used:
## jq
## sort
## /usr/bin/gzip
## ======= Check environment
# Check that a program can be located by the shell.
# Arguments: $1 - program name (looked up on PATH) or a path to it
# Outputs:   "Command <name> not found" on stdout when the lookup fails
# Returns:   0 when found, 1 otherwise
testForProgram() {
  local program="$1"
  if type -p "$program" &> /dev/null; then
    return 0
  fi
  echo "Command $program not found"
  return 1
}
## Test for "sort --parallel" (without it, sorting is going to be slower).
# Exits the script with status 9, after a message on stderr, when sort(1)
# rejects the --parallel option.
# NOTE: errexit is switched off for the probe and switched ON before
# returning, so the caller runs with "set -e" from this point onwards.
testSortParallel() {
  set +e
  sort --parallel=3 < /dev/null &> /dev/null
  if [[ $? -ne 0 ]]; then
    # Was "2&>1", which passes "2" as an argument and writes to a file named "1".
    echo "No --parallel support in sort(1)" 1>&2
    exit 9
  fi
  set -e
}
# Java launcher; override with the JAVA environment variable.
JAVA="${JAVA:-java}"

# Verify every external program up front so that all missing ones are
# reported together. The configured launcher ($JAVA) is checked rather
# than a literal "java" so an overridden launcher is honoured.
COMPLETE="yes"
for F in jq sort /usr/bin/gzip "$JAVA"; do
  if ! testForProgram "$F"; then
    COMPLETE="no"
  fi
done

# String comparison: "-ne" inside [[ ]] is arithmetic, where both "yes" and
# "no" evaluate to 0, so the original test could never be true.
if [[ "$COMPLETE" != "yes" ]]; then
  echo "One or more programs missing" 1>&2
  exit 9
fi
unset COMPLETE

testSortParallel
## ======== Setup

## Any TMPDIR inherited from the environment is discarded here: it is
## often a small-ish area unsuitable for large temp files.
## Use the --tmpdir flag to choose the work area.
TMPDIR=""

## Output format handed to date(1) when writing log lines.
DATE='+%H:%M:%S'
## Functions.
# Usage: log "LEVEL" "MESSAGE"
# Write one log line to stdout: "<time> <LEVEL padded to 5> <MESSAGE>".
# Globals:   DATE (read) - format argument passed to date(1)
# Arguments: $1 - level label, $2 - message text
log() {
  local LEVEL="$1"
  local MSG="$2"
  # The timestamp is passed as data, never as part of the printf format,
  # so a '%' produced by date(1) cannot be interpreted as a directive.
  printf '%s %-5s %s\n' "$(date "$DATE")" "$LEVEL" "$MSG"
}
# Log a message at DEBUG level, only when debug mode is on.
# Globals:   DEBUG (read) - debug mode is on when it equals 1; an unset or
#            empty DEBUG counts as off.
# Arguments: passed through to log after the level label
debug() {
  if [[ "${DEBUG:-0}" == 1 ]]; then
    log "DEBUG" "$@"
  fi
}
# Log a message at INFO level on stdout.
# Arguments are handed to log after the level label.
info() {
  log INFO "$@"
}
# Log a message at WARN level; the line is sent to stderr.
# Arguments are handed to log after the level label.
warn() {
  log WARN "$@" >&2
}
# Log a message at ERROR level; the line is sent to stderr.
# Arguments are handed to log after the level label.
error() {
  log ERROR "$@" >&2
}
# Print an error on stderr and terminate the script.
# Usage: abort [EXIT_CODE] MESSAGE...
# Arguments: $1 - optional integer exit code; when the first argument is
#                 not an integer it is treated as part of the message and
#                 the exit code defaults to 1.
# Outputs:   "ERROR <message>" on stderr
abort() {
  local EXIT=1
  # A pattern match detects the integer without the stderr noise that
  # "[ x -eq x ]" produces for non-numeric input.
  if [[ "${1:-}" =~ ^-?[0-9]+$ ]]; then
    EXIT="$1"
    shift
  fi
  # Log error and exit
  echo "ERROR" "$@" 1>&2
  exit "$EXIT"
}
# Print the current time as whole seconds, using date(1) with "+%s".
# (A "+%s%3N" format would give milliseconds instead.)
now() {
  date '+%s'
}
# Print the command line help on stdout.
# Globals: TDB_CMD (read) - command name shown in the Usage line; falls back
#          to the script's base name when unset or empty.
printUsage() {
  local script
  script="$(basename -- "$0")"
  local cmd="${TDB_CMD:-$script}"
  cat <<EOF
${script} TDB2 Bulk Loader
Usage: ${cmd} --loc <Directory> [--tmpdir=DIR] FILE ...
Bulk loader for TDB2.
See https://jena.apache.org/documentation/tdb/tdb-xloader.html
Environment variables:
JENA_HOME
Location of the Jena installation.
This defaults to the area where this script is being run from.
JENA_CP
Class path for running the java steps.
This defaults to \${JENA_HOME}/lib/*
JVM_ARGS
Arguments for the JVM.
This defaults to "-Xmx2G"
Do not set to all available RAM.
Increasing it does not make the loader faster.
The temporary directory defaults to the database directory.
EOF
}
# Determine JENA_HOME
# Print the target of a symbolic link; a path that is not a symbolic link
# is printed unchanged.
# Arguments: $1 - path to inspect
# Globals:   OSTYPE (read) - selects how readlink is driven
resolveLink() {
  local target=$1
  if [[ ! -L "$target" ]]; then
    echo "$target"
    return
  fi
  case "$OSTYPE" in
    darwin* | bsd*)
      # BSD readlink behaves differently to GNU readlink, so the chain
      # of links is followed by hand, one hop per iteration.
      until [[ ! -L "$target" ]]; do
        target=$(readlink -- "$target")
      done
      ;;
    *)
      # Assumes GNU readlink, where -f canonicalizes the path.
      target=$(readlink -f -- "$target")
      ;;
  esac
  echo "$target"
}
# With neither JENA_HOME nor JENA_CP supplied, derive JENA_HOME from the
# location of this script: it is the parent of the script's directory.
if [[ -z "$JENA_HOME" && -z "$JENA_CP" ]]; then
  SCRIPT="$0"
  # Common case: the script was invoked through a symbolic link.
  if [[ -L "$SCRIPT" ]]; then
    SCRIPT=$(resolveLink "$0")
    # A relative link target is anchored at the directory of $0.
    if [[ "$SCRIPT" != /* ]]; then
      SCRIPT="$(dirname "$0")/$SCRIPT"
    fi
  fi
  # Work out the installation root from the script location.
  JENA_HOME="$(cd "$(dirname "$SCRIPT")/.." && pwd)"
  export JENA_HOME
fi
# If JENA_HOME is a symbolic link it is replaced by the link target.
if [[ -L "${JENA_HOME}" ]]; then
  JENA_HOME_LINK="$JENA_HOME"
  JENA_HOME=$(resolveLink "$JENA_HOME_LINK")
  case "$JENA_HOME" in
    /*)
      # Already absolute
      ;;
    *)
      # A relative target is relative to the directory holding the link.
      # (Taking dirname of the target itself, as before, discarded the
      # target's own name and usually left just ".".)
      JENA_HOME="$(dirname "$JENA_HOME_LINK")/$JENA_HOME"
      ;;
  esac
  unset JENA_HOME_LINK
  export JENA_HOME
  #echo "Resolved symbolic links for JENA_HOME to $JENA_HOME"
fi
## Classpath JENA_CP.
## When not supplied it becomes every entry under ${JENA_HOME}/lib; the
## "*" is kept literal in the value (it is not expanded here).
if [[ -z "$JENA_CP" ]]; then
  [[ -n "$JENA_HOME" ]] || abort 1 "JENA_HOME is not set"
  JENA_CP="${JENA_HOME}/lib/*"
fi
# Run a Java main class with the configured JVM arguments and classpath.
# Globals:   JAVA (read), JVM_ARGS (read), JENA_CP (read)
# Arguments: main class name followed by its arguments
# On a non-zero exit from the JVM the full command line is logged through
# error and the script is terminated through abort with status 1.
exec_java() {
  local RC=0
  # JVM_ARGS is deliberately unquoted: it holds several space separated
  # options. "|| RC=$?" captures the status so that an active errexit
  # cannot end the script before the failure has been reported.
  "$JAVA" $JVM_ARGS -cp "$JENA_CP" "$@" || RC=$?
  if ((RC != 0)); then
    # One string: the logging helpers only print their first message argument.
    error "$JAVA $JVM_ARGS -cp $JENA_CP $*"
    abort 1 "Java process returned $RC"
  fi
}
## Command line options. Parsing stops at "--" or at the first argument
## that is not an option; what remains is the list of data files.
##   -d | --debug
##   -h | --help
##   --loc DIR | --loc=DIR        (also -loc, --location, -location)
##   --tmpdir DIR | --tmpdir=DIR  (also -tmpdir)
while [[ $# -gt 0 ]]; do
  ARG=$1
  case "$ARG" in
    -d | --debug)
      # Debug Mode
      shift
      DEBUG=1
      ;;
    -h | --help)
      printUsage
      exit 0
      ;;
    --loc | -loc | --location | -location)
      # Location, space separated
      [[ $# -ge 2 ]] || abort 1 "Option $ARG requires a directory argument"
      LOC="$2"
      shift 2
      ;;
    --loc=* | -loc=* | --location=* | -location=*)
      # Location, "=" separated. Only the option name up to the first "="
      # is removed, so a value that itself contains "loc=" is kept whole.
      LOC="${ARG#*=}"
      shift
      ;;
    -tmpdir | --tmpdir)
      # Workspace directory, space separated
      [[ $# -ge 2 ]] || abort 1 "Option $ARG requires a directory argument"
      TMPDIR="$2"
      shift 2
      ;;
    --tmpdir=* | -tmpdir=*)
      # Workspace directory, "=" separated
      TMPDIR="${ARG#*=}"
      shift
      ;;
    --)
      # Arguments separator
      shift
      break
      ;;
    -*)
      abort 1 "Unrecognized option $ARG, if this was meant to be a data file separate options from data files with --"
      ;;
    *)
      break
      ;;
  esac
done
# Whatever is left on the command line is the list of data files.
if (($# == 0)); then
  abort 1 "No files to load" 1>&2
fi
# Kept as one space separated string; it is word-split again where used.
DATAFILES="$*"

# A database directory is mandatory.
if [[ -z $LOC ]]; then
  abort 1 "No directory name for the database"
fi

# Without --tmpdir the work area is the database directory itself.
if [[ -z $TMPDIR ]]; then
  TMPDIR=$LOC
fi
export TMPDIR
## TDB1 / TDB2
## @@ The storage system is fixed to TDB2 here.
SYSTEM=TDB2
if [[ "$SYSTEM" == "TDB2" ]]; then
  ## @@ Prefix of the java command names used by the loading phases.
  PKG=tdb2.xloader
  ## @@ Placeholders, left empty.
  CMD_LOAD_TERMS=
  CMD_INGEST_DATA=
  CMD_BUILD_INDEX=
elif [[ "$SYSTEM" == "TDB1" ]]; then
  ## Not ported.
  abort 9 "System '$SYSTEM' not supported"
  ## PKG=org.apache.jena.tdb.xloader
  ## CMD_LOAD_TERMS=""
  ## CMD_INGEST_DATA=
  ## CMD_BUILD_INDEX=
else
  abort 2 "System '$SYSTEM' not recognized"
fi
## Never load into a location that is already present.
## @@ Better
[[ ! -e "$LOC" ]] || abort 3 "Directory $LOC already exists"

# A large heap is not required: 2G unless the caller supplied JVM_ARGS.
: "${JVM_ARGS:=-Xmx2G}"

# Report the configuration for this run.
for SETUP_LINE in "Setup:" " Database: $LOC" " Data: $DATAFILES" " TMPDIR: $TMPDIR"; do
  info "$SETUP_LINE"
done
unset SETUP_LINE

## Time point: start of the whole load.
TIME_START="$(now)"
## ======== Node table loading.
## TDB2 only. Runs CmdxBuildNodeTable over the data files and records the
## elapsed seconds in TIME_NODE_TABLE.
if [[ "$SYSTEM" == "TDB2" ]]; then
  info
  T="$(now)"
  info "Load node table"
  # LOC and TMPDIR are quoted so directories containing spaces survive;
  # DATAFILES is a space separated list and is split on purpose.
  exec_java "$PKG.CmdxBuildNodeTable" --loc "$LOC" --tmpdir "$TMPDIR" $DATAFILES
  TIME_NODE_TABLE=$(($(now) - T))
fi
## ======== Ingest data, creates workfiles
## Runs CmdxIngestData, naming triples.tmp and quads.tmp under TMPDIR as the
## work files, and records the elapsed seconds in TIME_INGEST.
info
info "Ingest data"
T="$(now)"
# LOC is quoted so a database directory containing spaces survives;
# DATAFILES is a space separated list and is split on purpose.
exec_java "$PKG.CmdxIngestData" --loc "$LOC" --tmpdir "$TMPDIR" \
  --triples "$TMPDIR/triples.tmp" --quads "$TMPDIR/quads.tmp" $DATAFILES
TIME_INGEST=$(($(now) - T))
## ======== Indexes
## File read after ingest for the triple and quad counts.
INFO="$TMPDIR/load.json"

## Bash associative array: index name -> seconds taken to build it.
declare -A TIME_IDX

# Build one index and record how long it took.
# Globals:   PKG, LOC, TMPDIR (read); TIME_IDX (written)
# Arguments: $1 - index name, e.g. SPO
function index() {
  local IDX="$1"
  info
  info "Build $IDX"
  # Declaration and assignment are separate so a failure of now is not
  # hidden behind the exit status of "local".
  local T
  T="$(now)"
  exec_java "$PKG.CmdxBuildIndex" --loc "$LOC" --tmpdir "$TMPDIR" --index "$IDX" \
    "$TMPDIR/triples.tmp" "$TMPDIR/quads.tmp"
  local T_IDX=$(($(now) - T))
  TIME_IDX[$IDX]=$T_IDX
}
## Decide which indexes to generate.
## TRIPLES_IDX / QUADS_IDX may be preset by the caller; otherwise the
## defaults below are used.
TRIPLES_DFT="SPO POS OSP"
QUADS_DFT="GSPO GPOS GOSP SPOG POSG OSPG"
TRIPLES_IDX="${TRIPLES_IDX:-$TRIPLES_DFT}"
QUADS_IDX="${QUADS_IDX:-$QUADS_DFT}"

if [[ -e "$INFO" ]]; then
  ## Skip a phase if there are no items to index.
  ## $INFO is quoted: an unquoted redirect target fails with
  ## "ambiguous redirect" when the path contains spaces.
  TRIPLES="$(jq .triples < "$INFO")"
  QUADS="$(jq .quads < "$INFO")"
  if [[ $TRIPLES -eq 0 ]]; then
    TRIPLES_IDX=""
  fi
  if [[ $QUADS -eq 0 ]]; then
    QUADS_IDX=""
  fi
fi
## ==== Triples, then quads
## Both lists are space separated and split on purpose; every triple
## index is built before the first quad index.
for IDX in $TRIPLES_IDX $QUADS_IDX; do
  index "$IDX"
done
## ======== Finish
## Temp files are left in place; deleting them would be:
## rm -f "$TMPDIR"/triples.tmp* "$TMPDIR"/quads.tmp*
info
TIME_FINISH="$(now)"

## ======== Reporting
TIME_TOTAL=$(($TIME_FINISH-$TIME_START))
SECS=$TIME_TOTAL
# Whole run as hours / minutes / seconds.
printf -v TIME_HMS '%02dh %02dm %02ds' $((SECS/3600)) $((SECS%3600/60)) $((SECS%60))

## Node table phase (reported only when it was timed), then ingest.
if [[ -n "$TIME_NODE_TABLE" ]]; then
  info "Load node table = $TIME_NODE_TABLE seconds"
fi
info "Load ingest data = $TIME_INGEST seconds"

## Indexes: triples first, then quads.
for IDX in $TRIPLES_IDX $QUADS_IDX; do
  info "Build index ${IDX} = ${TIME_IDX[${IDX}]} seconds"
done

## Whole run
info "Overall $TIME_TOTAL seconds"
info "Overall $TIME_HMS"
## Tuple counts and overall rate, available only when the load statistics
## file exists.
if [[ -e $INFO ]]; then
  # %'d formats the number with the locale's thousands grouping.
  printf -v TRIPLES_STR "%'d" "$TRIPLES"
  printf -v QUADS_STR "%'d" "$QUADS"
  info "Triples loaded = $TRIPLES_STR"
  info "Quads loaded = $QUADS_STR"
  TUPLES=$((TRIPLES + QUADS))
  # Timing has one second resolution, so a very short load has
  # TIME_TOTAL=0; treat it as one second rather than dividing by zero.
  RATE=$((TUPLES / (TIME_TOTAL > 0 ? TIME_TOTAL : 1)))
  printf -v RATE_STR "%'d" "$RATE"
  info "Overall Rate $RATE_STR tuples per second"
fi