| #!/usr/bin/env bash |
| |
| ## Licensed to the Apache Software Foundation (ASF) under one |
| ## or more contributor license agreements. See the NOTICE file |
| ## distributed with this work for additional information |
| ## regarding copyright ownership. The ASF licenses this file |
| ## to you under the Apache License, Version 2.0 (the |
| ## "License"); you may not use this file except in compliance |
| ## with the License. You may obtain a copy of the License at |
| ## |
| ## http://www.apache.org/licenses/LICENSE-2.0 |
| ## |
| ## Unless required by applicable law or agreed to in writing, software |
| ## distributed under the License is distributed on an "AS IS" BASIS, |
| ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ## See the License for the specific language governing permissions and |
| ## limitations under the License. |
| |
# Format string handed to date(1) when log lines are time-stamped.
DATE='+%H:%M:%S'
# Settings this script reads; none of them is given a value at this point.
# JENA_CP=
# JENA_HOME=
# TMPDIR=
| |
| ## Functions. |
# log "LEVEL" "MESSAGE"...
# Print one log line on stdout: "<time> <LEVEL, padded to 5 chars> <MESSAGE>".
#   $1   - level label, e.g. INFO
#   $2.. - message; several words are joined with single spaces so that
#          callers forwarding "$@" do not lose everything after the first word
# The time format is taken from $DATE (HH:MM:SS when DATE is unset or empty).
function log() {
  local LEVEL="$1"
  local MSG="${*:2}"
  local STAMP
  STAMP="$(date "${DATE:-+%H:%M:%S}")"
  # Fixed format string: the timestamp and message are data, never format.
  printf '%s %-5s %s\n' "$STAMP" "$LEVEL" "$MSG"
}
| |
# debug "MESSAGE"
# Log MESSAGE at DEBUG level, but only while debug mode is on (DEBUG=1, which
# the -d/--debug option sets). An unset or empty DEBUG counts as off.
function debug() {
  if [[ "${DEBUG:-0}" == 1 ]]; then
    log "DEBUG" "$@"
  fi
}
| |
# info "MESSAGE"
# Emit MESSAGE as an INFO-level log line (stdout, via log).
info() {
  log INFO "$@"
}
| |
# warn "MESSAGE"
# Emit MESSAGE as a WARN-level log line, redirected to stderr.
warn() {
  log WARN "$@" >&2
}
| |
# error "MESSAGE"
# Emit MESSAGE as an ERROR-level log line, redirected to stderr.
error() {
  log ERROR "$@" >&2
}
| |
# abort [EXIT_CODE] MESSAGE...
# Print "ERROR MESSAGE" on stderr and terminate the script.
#   EXIT_CODE - optional integer exit status. When the first argument is not
#               an integer it is treated as part of the message and the exit
#               status defaults to 1.
function abort() {
  local EXIT="$1"
  # Trick to check for numeric: -eq only succeeds when both operands are
  # integers. The complaint test prints for a non-integer is discarded so
  # that only the real error message reaches the user.
  if [ "$EXIT" -eq "$EXIT" ] 2>/dev/null; then
    shift
  else
    # Caller forgot to provide an exit code so use default of 1
    EXIT=1
  fi
  # Log error and exit
  echo "ERROR" "$@" 1>&2
  exit "$EXIT"
}
| |
# now
# Print the current time in seconds, as produced by date's +%s format.
# (Use +%s%3N instead for milliseconds.)
now() {
  date '+%s'
}
| |
# printUsage
# Print the command-line help text on stdout.
# The command shown in the "Usage:" line is ${TDB_CMD} when that is set and
# non-empty; otherwise it falls back to the name this script was invoked as,
# so the line never starts with an empty command name.
function printUsage() {
  local CMD
  CMD="$(basename -- "$0")"
  cat <<EOF
${CMD} TDB2 Bulk Loader

Usage: ${TDB_CMD:-$CMD} --loc <Directory> [--tmpdir=DIR] FILE ...

Bulk loader for TDB2.
See https://jena.apache.org/documentation/tdb/tdb-xloader.html

Environment variables:

  JENA_HOME
    Location of the Jena installation.
    This defaults to the area where this script is being run from.

  JENA_CP
    Class path for running the java steps.
    This defaults to \${JENA_HOME}/lib/\*

  JVM_ARGS
    Arguments for the JVM.
    This defaults to "-Xmx2G"
    Do not set to all available RAM.
    Increasing it does not make the loader faster.

EOF
}
| |
| # Determine JENA_HOME |
| |
# resolveLink PATH
# Print PATH unchanged when it is not a symbolic link; otherwise print what
# the link resolves to:
#   - darwin*/bsd* : follow the chain one readlink call at a time
#                    (BSD readlink behaves differently to GNU readlink)
#   - anything else: assume GNU readlink and canonicalize with -f
resolveLink() {
  local target="$1"

  # Nothing to resolve for a non-link.
  if [ ! -L "$target" ]; then
    echo "$target"
    return
  fi

  case "$OSTYPE" in
    darwin* | bsd*)
      until [ ! -L "$target" ]; do
        target=$(readlink -- "$target")
      done
      ;;
    *)
      target=$(readlink -f -- "$target")
      ;;
  esac

  echo "$target"
}
| |
# If neither JENA_HOME nor an explicit classpath (JENA_CP) was supplied,
# derive JENA_HOME from where this script lives: the parent of the directory
# containing the script.
if [[ -z "$JENA_HOME" && -z "$JENA_CP" ]]; then
  SCRIPT="$0"
  # Catch common issue: script has been symlinked
  if [[ -L "$SCRIPT" ]]; then
    SCRIPT="$(resolveLink "$0")"
    # The resolved target may be relative; if so, anchor it at the directory
    # that holds the link.
    case "$SCRIPT" in
      /*) ;;
      *) SCRIPT="$(dirname -- "$0")/$SCRIPT" ;;
    esac
  fi

  # Work out root from script location. CDPATH is emptied for the cd so that
  # cd cannot print the directory it selected; that output would otherwise
  # be captured ahead of pwd's and corrupt JENA_HOME.
  JENA_HOME="$(CDPATH='' cd -- "$(dirname -- "$SCRIPT")/.." && pwd)"
  export JENA_HOME
fi
| |
# If JENA_HOME is a symbolic link need to resolve
if [[ -L "${JENA_HOME}" ]]; then
  # Keep the link's own path: a relative target has to be interpreted
  # relative to the directory that contains the link.
  JENA_HOME_LINK="$JENA_HOME"
  JENA_HOME="$(resolveLink "$JENA_HOME_LINK")"
  case "$JENA_HOME" in
    /*)
      # Already absolute
      ;;
    *)
      # Relative target: prefix the link's directory (taking dirname of the
      # target itself would discard the target and leave e.g. ".").
      JENA_HOME="$(dirname -- "$JENA_HOME_LINK")/$JENA_HOME"
      ;;
  esac
  unset JENA_HOME_LINK
  export JENA_HOME
  #echo "Resolved symbolic links for JENA_HOME to $JENA_HOME"
fi
| |
| |
## Classpath JENA_CP.
## A JENA_CP supplied by the caller is kept as is; otherwise it is set to
## ${JENA_HOME}/lib/* and the script aborts if JENA_HOME is empty.
if [ -z "$JENA_CP" ]; then
  [ -n "$JENA_HOME" ] || abort 1 "JENA_HOME is not set"
  # The '*' is stored literally; no glob expansion happens in an assignment.
  JENA_CP="${JENA_HOME}/lib/*"
fi
| |
# exec_java CLASS [ARG...]
# Run "$JAVA" with the options in $JVM_ARGS and the classpath $JENA_CP,
# passing CLASS and every ARG through unchanged.
# When the process exits non-zero, the full command line is logged through
# error and the script is aborted with status 1.
function exec_java() {
  # JVM_ARGS is left unquoted on purpose: it may hold several
  # space-separated JVM options that must become separate words.
  # shellcheck disable=SC2086
  "$JAVA" $JVM_ARGS -cp "$JENA_CP" "$@"
  local RC="$?"
  if (( RC != 0 )); then
    # Pass the command line as ONE string so that all of it is logged as
    # the message rather than just its first word.
    error "$JAVA $JVM_ARGS -cp $JENA_CP $*"
    abort 1 "Java process returned $RC"
  fi
}
| |
## Command line option parsing.
## Consumes leading options from "$@"; stops at the first non-option argument
## or just after "--". Whatever is left in "$@" is the list of data files.
while [ $# -gt 0 ]; do
  ARG="$1"
  case "$ARG" in
    -d | --debug)
      # Debug Mode
      DEBUG=1
      shift
      ;;
    -h | --help)
      printUsage
      exit 0
      ;;
    --loc | -loc)
      # Location space separated
      [ $# -ge 2 ] || abort 1 "Option $ARG requires a value"
      LOC="$2"
      shift 2
      ;;
    --loc=* | -loc=*)
      # Location = separated. Strip only up to the FIRST '=' so that values
      # which themselves contain "loc=" or '=' are kept intact.
      LOC="${ARG#*=}"
      shift
      ;;
    -tmpdir | --tmpdir)
      # Workspace directory
      [ $# -ge 2 ] || abort 1 "Option $ARG requires a value"
      TMPDIR="$2"
      shift 2
      ;;
    --tmpdir=* | -tmpdir=*)
      # Workspace directory, = separated
      TMPDIR="${ARG#*=}"
      shift
      ;;
    --)
      # Arguments separator
      shift
      break
      ;;
    -*)
      abort 1 "Unrecognized option $ARG, if this was meant to be a data file separate options from data files with --"
      ;;
    *)
      # First data file: leave it (and everything after it) in "$@".
      break
      ;;
  esac
done
| |
## Whatever option parsing left in "$@" is the data; there must be some.
if (( $# == 0 )); then
  abort 1 "No files to load" 1>&2
fi

## The data file arguments, flattened into one space-separated string.
DATAFILES="$@"

## A database location is mandatory.
if [[ -z "$LOC" ]]; then
  abort 1 "No directory name for the database"
fi

## The workspace defaults to the database location when TMPDIR is empty.
if [[ -z "$TMPDIR" ]]; then
  TMPDIR="$LOC"
fi
export TMPDIR
| ## --tmpdir |
| ## --loc|--location |
| ## --help |
| |
## TDB1 / TDB2
## Select the storage system and the Java package holding its loader
## commands. SYSTEM is fixed to TDB2 here; TDB1 is recognized but not ported.
## @@
SYSTEM=TDB2
if [ "$SYSTEM" = "TDB2" ]; then
  ## @@
  PKG=tdb2.xloader
  ## @@ Placeholders, left empty.
  CMD_LOAD_TERMS=
  CMD_INGEST_DATA=
  CMD_BUILD_INDEX=
elif [ "$SYSTEM" = "TDB1" ]; then
  ## Not ported.
  abort 9 "System '$SYSTEM' not supported"
  ## PKG=org.apache.jena.tdb.xloader
  ## CMD_LOAD_TERMS=""
  ## CMD_INGEST_DATA=
  ## CMD_BUILD_INDEX=
else
  abort 2 "System '$SYSTEM' not recognized"
fi
| |
## Refuse to load into a location that already exists.
## (Nothing is deleted here; the script simply stops.)
## @@ Better
[ ! -e "$LOC" ] || abort 3 "Directory $LOC already exists"

## Java executable: $JAVA when set and non-empty, otherwise plain "java".
JAVA="${JAVA:-java}"

info "Setup:"
info " Data: $DATAFILES"
info " Database: $LOC"
info " Tmpdir: $TMPDIR"

# Large heap not required; -Xmx2G applies unless JVM_ARGS is already set.
JVM_ARGS="${JVM_ARGS:--Xmx2G}"
| |
## Time points.

TIME_START="$(now)"

## The data files are passed to Java as "$@": after option parsing the
## positional parameters hold exactly the data file arguments, and unlike the
## flattened $DATAFILES string they keep file names containing whitespace
## intact.

## Node table loading.
if [ "$SYSTEM" == "TDB2" ]; then
  T="$(now)"
  info "Load node table"
  exec_java "${PKG}.CmdxBuildNodeTable" --loc "$LOC" --tmpdir "$TMPDIR" "$@"
  TIME_NODE_TABLE=$(( $(now) - T ))
fi

## Ingest data, create workfiles
info
info "Ingest data"
T="$(now)"
exec_java "${PKG}.CmdxIngestData" --loc "$LOC" --tmpdir "$TMPDIR" \
  --triples "$TMPDIR/triples.tmp" --quads "$TMPDIR/quads.tmp" "$@"
TIME_INGEST=$(( $(now) - T ))
| |
| ## @@ triples.tmp quads.tmp |
| |
# index NAME
# Run the CmdxBuildIndex Java command for the index called NAME (e.g. SPO),
# giving it the database location, the workspace directory and the two work
# files triples.tmp and quads.tmp from the workspace.
# All paths are quoted so locations containing whitespace stay single words.
function index() {
  local IDX="$1"
  exec_java "${PKG}.CmdxBuildIndex" --loc "$LOC" --tmpdir "$TMPDIR" --index "$IDX" \
    "$TMPDIR/triples.tmp" "$TMPDIR/quads.tmp"
}
| |
## Build the SPO, POS and OSP indexes one after another, recording how many
## seconds each one took in TIME_IDX_SPO, TIME_IDX_POS and TIME_IDX_OSP.
for IDX_NAME in SPO POS OSP; do
  info
  info "Build $IDX_NAME"
  T="$(now)"
  index "$IDX_NAME"
  # printf -v assigns to the variable named by its argument; unlike "let",
  # its exit status does not turn non-zero when the elapsed time is 0.
  printf -v "TIME_IDX_${IDX_NAME}" '%d' "$(( $(now) - T ))"
done
unset IDX_NAME
| |
## @@ The work files are not removed; the cleanup below is disabled.
#rm "$TMPDIR/triples.tmp" "$TMPDIR/quads.tmp"

TIME_FINISH="$(now)"
TIME_TOTAL=$(( $TIME_FINISH - $TIME_START ))

## Overall duration broken down into hours, minutes and seconds.
SECS=$TIME_TOTAL
TIME_HMS="$(printf '%02dh %02dm %02ds\n' $((SECS/3600)) $((SECS%3600/60)) $((SECS%60)))"

## Per-step summary. The node table line appears only if that step was timed.
[ -z "$TIME_NODE_TABLE" ] || info "Load node table = $TIME_NODE_TABLE seconds"
info "Load ingest data = $TIME_INGEST seconds"
info "Build index SPO = $TIME_IDX_SPO seconds"
info "Build index POS = $TIME_IDX_POS seconds"
info "Build index OSP = $TIME_IDX_OSP seconds"
info "Overall $TIME_TOTAL seconds"
info "Overall $TIME_HMS"