#!/usr/bin/env bash

## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements.  See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership.  The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License.  You may obtain a copy of the License at
##
##     http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.

## External environment variables.
##   JENA_CP
##   JENA_HOME
##   JVM_ARGS

## Programs used:
##   jq
##   sort
##   /usr/bin/gzip

## ======= Check environment

# Check that a program can be located by the shell.
# Arguments: $1 - command name or path, looked up with "type -p"
# Outputs:   "Command <name> not found" on stdout when the lookup fails
# Returns:   0 when the program is found, 1 otherwise
testForProgram() {
    local prog="$1"
    if ! type -p "$prog" &> /dev/null; then
        echo "Command $prog not found"
        return 1
    fi
}

## Test for "sort --parallel" (without it, sorting is going to be slower).
# Outputs: an error message on stderr when sort(1) rejects --parallel.
# Exits:   with status 9 when --parallel is not supported.
# NOTE: errexit ("set -e") is switched on when this function returns, as it
# was before this rewrite; everything that runs afterwards executes under it.
testSortParallel() {
    # Probing inside "if" keeps a failing sort from tripping errexit, so no
    # temporary "set +e" is needed.
    if ! sort --parallel=3 < /dev/null &> /dev/null; then
        # Must be "1>&2": the former "2&>1" wrote the text (plus a stray "2")
        # into a file named "1" instead of to stderr.
        echo "No --parallel support in sort(1)" 1>&2
        exit 9
    fi
    set -e
}

## Java launcher; override by setting JAVA in the environment.
JAVA="${JAVA:-java}"

## Verify that every external program the loader needs can be found.
## All missing programs are reported before giving up.
## The configured launcher ("$JAVA") is checked rather than a literal "java".
COMPLETE="yes"
for F in jq sort /usr/bin/gzip "$JAVA"; do
    if ! testForProgram "$F"; then
        COMPLETE="no"
    fi
done

## This has to be a string comparison: "-ne" compares arithmetically, where
## both "yes" and "no" evaluate to 0, so the check could never fire.
if [[ "$COMPLETE" != "yes" ]]; then
    echo "One or more programs missing" 1>&2
    exit 9
fi

unset COMPLETE

testSortParallel

## ======== Setup

## The TMPDIR environment variable is deliberately discarded here: it often
## points at a small-ish area that is unsuitable for large temporary files.
## Use the --tmpdir flag instead.
TMPDIR=""

## Format argument passed to date(1) for log timestamps.
DATE='+%H:%M:%S'

## Functions.

# Write one timestamped log line to stdout.
# Usage: log "LEVEL" "MESSAGE"...
# Globals:   DATE (read) - format argument handed to date(1)
# Arguments: $1    - level label, left-justified in a 5-character column
#            $2... - message; extra words are joined with the first
#                    character of IFS (a space by default) instead of
#                    being silently dropped
log() {
    local LEVEL="$1"
    local MSG="${*:2}"
    # The timestamp is passed as data, never as part of the format string,
    # so a "%" produced by date(1) cannot corrupt the output.
    # ${DATE:+...} keeps the plain "date" behaviour when DATE is empty.
    printf '%s %-5s %s\n' "$(date ${DATE:+"$DATE"})" "$LEVEL" "$MSG"
}

# Log a DEBUG-level message, but only when debug mode is switched on.
# Globals:   DEBUG (read) - the value "1" enables debug output
# Arguments: forwarded to log after the level
debug() {
    # An unset or empty DEBUG counts as "off". The former unquoted
    # "[ $DEBUG = 1 ]" printed "unary operator expected" in that case.
    if [[ "${DEBUG:-0}" == 1 ]]; then
        log "DEBUG" "$@"
    fi
}

# Log at INFO level on stdout.
# Arguments: passed through unchanged to log, after the level.
info() {
    log 'INFO' "$@"
}

# Log at WARN level; the line is sent to stderr.
# Arguments: passed through unchanged to log, after the level.
warn() {
    log 'WARN' "$@" >&2
}

# Log at ERROR level; the line is sent to stderr.
# Arguments: passed through unchanged to log, after the level.
error() {
    log 'ERROR' "$@" >&2
}

# Print an error message on stderr and terminate the script.
# Usage: abort [EXIT_CODE] MESSAGE...
# Arguments: $1 - exit code; when it is not an integer it is treated as the
#                 first word of the message and the exit code defaults to 1
# Exits:     always, with the chosen exit code
abort() {
    local EXIT="$1"
    # Trick to check for numeric:
    # -eq only succeeds when both operands are integers. The complaint that
    # "[" prints for a non-numeric value is discarded so that it does not
    # pollute the real error message.
    if [ "$EXIT" -eq "$EXIT" ] 2> /dev/null; then
        shift
    else
        # Caller forgot to provide an exit code so use default of 1
        EXIT=1
    fi
    # Log error and exit
    echo "ERROR" "$@" 1>&2
    exit "$EXIT"
}

# Print the current time as whole seconds since the epoch.
# (date(1) format +%s%3N would give milliseconds instead.)
now() {
    date '+%s'
}

# Print the command line help text on stdout.
# Globals: TDB_CMD (read, optional) - command name shown in the usage line.
#          Nothing in the visible script assigns it, so the script's own
#          name is used when it is unset or empty.
printUsage() {
    local CMD_NAME
    CMD_NAME="$(basename "$0")"
    cat <<EOF
${CMD_NAME} TDB2 Bulk Loader

Usage: ${TDB_CMD:-$CMD_NAME} --loc <Directory> [--tmpdir=DIR] FILE ...

Bulk loader for TDB2.
See https://jena.apache.org/documentation/tdb/tdb-xloader.html

Environment variables:

JENA_HOME
Location of the Jena installation.
This defaults to the area where this script is being run from.

JENA_CP
Class path for running the java steps.
This defaults to \${JENA_HOME}/lib/*

JVM_ARGS
Arguments for the JVM.
This defaults to "-Xmx2G"
Do not set to all available RAM.
Increasing it does not make the loader faster.

The temporary directory defaults to the database directory.

EOF
}

# Determine JENA_HOME

# Resolve a symbolic link to the path it points at.
# Globals:   OSTYPE (read) - selects BSD-style or GNU-style readlink usage
# Arguments: $1 - path that may be a symbolic link
# Outputs:   the resolved path on stdout; $1 itself when it is not a symlink
resolveLink() {
    local TARGET=$1

    # Nothing to resolve for a non-link.
    if [ ! -L "$TARGET" ]; then
        echo "$TARGET"
        return
    fi

    case "$OSTYPE" in
        darwin*|bsd*)
            # BSD readlink resolves a single level only, so keep following
            # links by hand until a non-link is reached.
            until [ ! -L "$TARGET" ]; do
                TARGET=$(readlink -- "$TARGET")
            done
            ;;
        *)
            # Assume GNU readlink, whose -f canonicalizes the whole path.
            TARGET=$(readlink -f -- "$TARGET")
            ;;
    esac

    echo "$TARGET"
}

# Derive JENA_HOME from the script location, but only when neither
# JENA_HOME nor an explicit classpath (JENA_CP) has been supplied.
if [[ -z "$JENA_HOME" && -z "$JENA_CP" ]]; then
    SCRIPT="$0"
    # Catch common issue: script has been symlinked
    if [[ -L "$SCRIPT" ]]; then
        SCRIPT=$(resolveLink "$0")
        # A relative link target is taken relative to the directory of $0;
        # an absolute one is used as it is.
        if [[ "$SCRIPT" != /* ]]; then
            SCRIPT="$(dirname "$0")/$SCRIPT"
        fi
    fi

    # The installation root is the parent of the directory holding the script.
    JENA_HOME="$(cd "$(dirname "$SCRIPT")/.." && pwd)"
    export JENA_HOME
fi

# If JENA_HOME is a symbolic link, replace it with the location it points at.
if [[ -L "${JENA_HOME}" ]]; then
    # Remember the link itself: a relative target has to be interpreted
    # relative to the directory that contains the link.
    JENA_HOME_LINK="$JENA_HOME"
    JENA_HOME=$(resolveLink "$JENA_HOME_LINK")
    case "$JENA_HOME" in
        /*)
            # Already absolute
            ;;
        *)
            # Relative, make absolute. Taking dirname of the target alone
            # (as was done before) drops the final path component and
            # usually leaves just ".".
            JENA_HOME="$(dirname "$JENA_HOME_LINK")/$JENA_HOME"
            ;;
    esac
    unset JENA_HOME_LINK
    export JENA_HOME
    #echo "Resolved symbolic links for JENA_HOME to $JENA_HOME"
fi

## Classpath JENA_CP.
## When no classpath is supplied, use every entry under ${JENA_HOME}/lib.
## The "*" is kept literal so that it reaches the JVM unexpanded.
if [[ -z "$JENA_CP" ]]; then
    [[ -n "$JENA_HOME" ]] || abort 1 "JENA_HOME is not set"
    JENA_CP="${JENA_HOME}/lib/*"
fi

# Run a Java main class with the configured JVM options and classpath.
# Globals:   JAVA, JVM_ARGS, JENA_CP (read)
# Arguments: $1 - main class, $2... - its arguments
# Exits:     through abort (status 1) when the JVM returns non-zero, after
#            logging the complete command line with error
exec_java() {
    local RC=0
    # Capture the status with "||": under errexit a bare failing command
    # would terminate the script before the failure could be reported.
    # JVM_ARGS is intentionally unquoted so it splits into separate options.
    "$JAVA" $JVM_ARGS -cp "$JENA_CP" "$@" || RC=$?
    if (( RC != 0 )); then
        # One string, so the whole command line is logged as the message.
        error "$JAVA $JVM_ARGS -cp $JENA_CP $*"
        abort 1 "Java process returned $RC"
    fi
}

## Command line options:
##   -d | --debug
##   -h | --help
##   --loc DIR     or  --loc=DIR
##   --tmpdir DIR  or  --tmpdir=DIR
##   --            end of options; everything after it is a data file
## Parsing stops at the first argument that is not an option.
while [ $# -gt 0 ]; do
    ARG=$1
    case "$ARG" in
        -d|--debug)
            # Debug Mode
            shift
            DEBUG=1
            ;;
        -h|--help)
            printUsage
            exit 0
            ;;
        --loc|-loc)
            # Location space separated
            [[ $# -ge 2 ]] || abort 1 "Option $ARG requires a value"
            LOC="$2"
            shift 2
            ;;
        -*loc=*)
            # Location = separated.
            # Strip up to the FIRST "=": the previous ${ARG/-*loc=/} was a
            # longest match and mangled values that themselves contain
            # "-...loc=".
            # NOTE(review): this pattern also matches things like
            # --tmpdir=/x-loc=y; kept as it was for compatibility.
            LOC="${ARG#*=}"
            shift
            ;;
        -tmpdir|--tmpdir)
            # Workspace directory
            [[ $# -ge 2 ]] || abort 1 "Option $ARG requires a value"
            TMPDIR="$2"
            shift 2
            ;;
        -*tmpdir=*)
            # Workspace directory, = separated (first "=" ends the name).
            TMPDIR="${ARG#*=}"
            shift
            ;;
        --)
            # Arguments separator
            shift
            break
            ;;
        -*)
            abort 1 "Unrecognized option $ARG, if this was meant to be a data file separate options from data files with --"
            ;;
        *)
            break
            ;;
    esac
done

## Whatever is left on the command line is the list of data files.
if (( $# == 0 )); then
    abort 1 "No files to load" 1>&2
fi

## Kept as a single space-joined string; it is word-split again where used.
DATAFILES="$*"

if [[ -z "$LOC" ]]; then
    abort 1 "No directory name for the database"
fi

## Without --tmpdir, temporary files go into the database directory.
if [[ -z "$TMPDIR" ]]; then
    TMPDIR="$LOC"
fi
export TMPDIR

## TDB1 / TDB2
## @@
## Select the storage system; only TDB2 is wired up.
SYSTEM=TDB2
if [[ "$SYSTEM" == "TDB2" ]]; then
    ## @@
    ## Prefix of the Java command classes launched later in the script.
    PKG=tdb2.xloader
    ## @@
    CMD_LOAD_TERMS=
    CMD_INGEST_DATA=
    CMD_BUILD_INDEX=
elif [[ "$SYSTEM" == "TDB1" ]]; then
    ## Not ported.
    abort 9 "System '$SYSTEM' not supported"
    ## PKG=org.apache.jena.tdb.xloader
    ## CMD_LOAD_TERMS=""
    ## CMD_INGEST_DATA=
    ## CMD_BUILD_INDEX=
else
    abort 2 "System '$SYSTEM' not recognized"
fi

## Don't mess up an existing database!
## @@ Better
[[ ! -e "$LOC" ]] || abort 3 "Directory $LOC already exists"

## Show the effective settings before any work starts.
info "Setup:"
info " Database: $LOC"
info " Data: $DATAFILES"
info " TMPDIR: $TMPDIR"

## Large heap not required: default to -Xmx2G when JVM_ARGS is unset or empty.
: "${JVM_ARGS:=-Xmx2G}"

## Time point: start of the whole run, in epoch seconds.
TIME_START="$(now)"

## ======== Node table loading.
## "$LOC" is quoted in both commands below so that a database path containing
## whitespace stays a single argument. $DATAFILES is deliberately left
## unquoted: it is a space-joined list that has to split into file names.
if [ "$SYSTEM" == "TDB2" ]; then
    ## TDB2 only.
    info
    T="$(now)"
    info "Load node table"
    exec_java "$PKG.CmdxBuildNodeTable" --loc "$LOC" --tmpdir "$TMPDIR" $DATAFILES
    TIME_NODE_TABLE=$(( $(now) - T ))
fi

## ======== Ingest data, creates workfiles
## The triples/quads work files named here are the inputs of the index builds.
info
info "Ingest data"
T="$(now)"
exec_java "$PKG.CmdxIngestData" --loc "$LOC" --tmpdir "$TMPDIR" \
    --triples "$TMPDIR/triples.tmp" --quads "$TMPDIR/quads.tmp" $DATAFILES
TIME_INGEST=$(( $(now) - T ))

## ======== Indexes
## JSON file in the temporary directory; its .triples and .quads values are
## read with jq further down (presumably written by the ingest step - confirm).
INFO="${TMPDIR}/load.json"

## Bash associative array: index name -> build time in seconds.
declare -A TIME_IDX

# Build one index and record how long it took.
# Globals:   PKG, LOC, TMPDIR (read); TIME_IDX (written: elapsed seconds,
#            keyed by index name)
# Arguments: $1 - index name, e.g. SPO or GSPO
index() {
    local IDX="$1"
    info
    info "Build $IDX"
    # Declared separately so "local" does not mask the status of now.
    local T
    T="$(now)"
    # "$LOC" and "$IDX" are quoted so each stays a single argument even when
    # it contains whitespace.
    exec_java "$PKG.CmdxBuildIndex" --loc "$LOC" --tmpdir "$TMPDIR" --index "$IDX" \
        "$TMPDIR/triples.tmp" "$TMPDIR/quads.tmp"
    TIME_IDX[$IDX]=$(( $(now) - T ))
}

## Decide which indexes to generate.
## TRIPLES_IDX / QUADS_IDX may be preset to override the default lists.
TRIPLES_DFT="SPO POS OSP"
QUADS_DFT="GSPO GPOS GOSP SPOG POSG OSPG"

TRIPLES_IDX="${TRIPLES_IDX:-$TRIPLES_DFT}"
QUADS_IDX="${QUADS_IDX:-$QUADS_DFT}"

if [ -e "$INFO" ]; then
    ## Skip a phase if there are no items to index.
    ## "$INFO" is quoted: an unquoted redirection target containing
    ## whitespace fails with "ambiguous redirect".
    TRIPLES="$(jq .triples < "$INFO")"
    QUADS="$(jq .quads < "$INFO")"
    if [[ $TRIPLES -eq 0 ]]; then
        TRIPLES_IDX=""
    fi
    if [[ $QUADS -eq 0 ]]; then
        QUADS_IDX=""
    fi
fi

## ==== Triples, then quads
## Both lists are space-separated index names and are split on purpose;
## an empty list contributes nothing to the loop.
for IDX in $TRIPLES_IDX $QUADS_IDX; do
    index "$IDX"
done

## ======== Finish

## Delete temp files.
## rm -f "$TMPDIR"/triples.tmp* "$TMPDIR"/quads.tmp*

info
TIME_FINISH="$(now)"

## ======== Reporting
TIME_TOTAL=$(( TIME_FINISH - TIME_START ))

## Ingest
if [ -n "$TIME_NODE_TABLE" ]; then
    info "Load node table = $TIME_NODE_TABLE seconds"
fi

SECS=$TIME_TOTAL
TIME_HMS="$(printf '%02dh %02dm %02ds\n' $((SECS/3600)) $((SECS%3600/60)) $((SECS%60)))"

info "Load ingest data = $TIME_INGEST seconds"

## Indexes: triples first, then quads, in the order they were built.
for IDX in $TRIPLES_IDX $QUADS_IDX; do
    info "Build index ${IDX} = ${TIME_IDX[${IDX}]} seconds"
done

## Whole run
info "Overall $TIME_TOTAL seconds"
info "Overall $TIME_HMS"

if [[ -e "$INFO" ]]; then
    ## %'d groups digits according to the current locale.
    printf -v TRIPLES_STR "%'d" "$TRIPLES"
    printf -v QUADS_STR "%'d" "$QUADS"
    info "Triples loaded = $TRIPLES_STR"
    info "Quads loaded = $QUADS_STR"
    TUPLES=$(( TRIPLES + QUADS ))
    ## A run that finishes within the same second has TIME_TOTAL == 0;
    ## count it as one second rather than dividing by zero.
    if (( TIME_TOTAL > 0 )); then
        RATE=$(( TUPLES / TIME_TOTAL ))
    else
        RATE=$TUPLES
    fi
    printf -v RATE_STR "%'d" "$RATE"
    info "Overall Rate $RATE_STR tuples per second"
fi