blob: ba41f006861a78bac59a0747a3d4b1c0072f3715 [file] [log] [blame]
#!/usr/bin/env bash
## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements. See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership. The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License. You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
# The environment for this sub-script is set up by "tdbloader2"
# Resolves a path that may be a symbolic link to the location it points at.
#
# Arguments:
#   $1 - the path to resolve
# Outputs:
#   Writes the resolved path to stdout. A path that is not a symbolic link
#   is written back unchanged.
function resolveLink() {
  local NAME=$1
  local TARGET

  if [ -L "$NAME" ]; then
    case "$OSTYPE" in
      darwin*|bsd*)
        # BSD style readlink behaves differently to GNU readlink
        # Have to manually follow links
        while [ -L "$NAME" ]; do
          TARGET=$(readlink -- "$NAME")
          case "$TARGET" in
            /*)
              # Absolute target replaces the current path outright
              NAME=$TARGET
              ;;
            *)
              # A relative target is relative to the directory holding the
              # link, not to the current working directory, so re-anchor it
              # there before following the next hop
              NAME="$(dirname -- "$NAME")/$TARGET"
              ;;
          esac
        done
        ;;
      *)
        # Assuming standard GNU readlink with -f for
        # canonicalize
        NAME=$(readlink -f -- "$NAME")
        ;;
    esac
  fi

  # printf rather than echo so a path such as "-n" is printed literally
  printf '%s\n' "$NAME"
}
# Pull in common functions
# JENA_HOME has to be set because the shared helper script is loaded from
# beneath it
if [[ -z "${JENA_HOME}" ]]; then
  echo "JENA_HOME is not set"
  exit 1
fi

# When JENA_HOME is itself a symbolic link, replace it with the link target
if [[ -L "${JENA_HOME}" ]]; then
  JENA_HOME=$(resolveLink "${JENA_HOME}")
  # NOTE(review): for a non-absolute result only its parent directory
  # (dirname) is kept, which does not by itself produce an absolute path -
  # confirm this is the intended handling of relative link targets
  if [[ "${JENA_HOME}" != /* ]]; then
    JENA_HOME=$(dirname "${JENA_HOME}")
  fi
  export JENA_HOME
  echo "Resolved symbolic links for JENA_HOME to ${JENA_HOME}"
fi

# Load the shared helper functions, giving up when the script is missing
if [[ ! -e "${JENA_HOME}/bin/tdbloader2common" ]]; then
  echo "Unable to locate common functions script tdbloader2common"
  exit 1
fi
source "${JENA_HOME}/bin/tdbloader2common"
# Prints the command line help for tdbloader2data on standard output.
# Takes no arguments. The here-document delimiter is quoted so the help text
# is emitted literally, with no shell expansion applied to it.
printUsage() {
  cat <<'EOF'
tdbloader2data - TDB Bulk Loader - Data Phase
Usage tdbloader2data --loc <Directory> [Options] <Data> ...
Bulk Loader for TDB which generates the Node Table. This command
relies on POSIX utilities so will only work on POSIX operating
systems.
This command can only be used to create new database. If you wish to
bulk load to an existing database please use tdbloader instead.
Required options are as follows:
-l <DatabaseDirectory>
--loc <DatabaseDirectory>
Sets the location in which the database should be created.
This location must be a directory and must be empty, if a
non-existent path is specified it will be created as a new
directory.
<Data>
Specifies the path to one/more data files to load
Common additional options are as follows:
-h
--help
Prints this help summary and exits
Advanced additional options are as follows:
-d
--debug
Enable debug mode, adds extra debug output
-j <JvmArgs>
--jvm-args <JvmArgs>
Sets the arguments that should be passed to the JVM for the
JVM based portions of the build.
Generally it is best to not change these unless you have been
specifically advised to. The scripts will use appropriate
defaults if this is not specified.
In particular be careful increasing the heap size since many
parts of TDB actually use memory mapped files that live
outside the heap so if the heap is too large the heap may
conflict with the memory mapped files for memory space.
-k
--keep-work
Keeps the temporary work files around after they are no longer
needed. May be useful for debugging.
-t
--trace
Enable trace mode, essentially sets -x within the scripts
EOF
}
# Exit on error.
set -e

# Process Arguments
# Options are consumed from the front of the positional parameters. Whatever
# is left once the loop ends is treated as the list of data files.
LOC=
KEEP_WORK=0
DEBUG=0
while [ $# -gt 0 ]; do
  ARG=$1
  case "$ARG" in
    -d|--debug)
      # Debug Mode
      shift
      DEBUG=1
      ;;
    -h|--help)
      printUsage
      exit 0
      ;;
    -j|--jvm-args)
      # JVM Arguments, the value is the next argument.
      # Check for it explicitly: shifting past the end of the arguments
      # fails, and with "set -e" active that would end the script without
      # any explanation.
      if [ $# -lt 2 ]; then
        abort 1 "Option $ARG requires an argument"
      fi
      JVM_ARGS="$2"
      shift 2
      ;;
    -k|--keep-work)
      # Keep work files
      # This option is actually not used by this script but may be passed in
      # by the parent tdbloader2 script
      shift
      KEEP_WORK=1
      ;;
    -l|--loc|-loc)
      # Location space separated, the value is the next argument
      if [ $# -lt 2 ]; then
        abort 1 "Option $ARG requires an argument"
      fi
      LOC="$2"
      shift 2
      ;;
    -*loc=*)
      # Location = separated
      # Strip only the shortest leading "-...loc=" so that a location which
      # itself contains "loc=" is kept intact
      LOC=${ARG#-*loc=}
      shift
      ;;
    -t|--trace)
      # Trace mode
      shift
      set -x
      ;;
    --)
      # Arguments separator
      # All further arguments are treated as data files
      shift
      break
      ;;
    -*)
      # Unrecognized
      abort 1 "Unrecognized option $ARG, if this was meant to be a data file separate options from data files with --"
      ;;
    *)
      # Any further arguments are treated as data files
      break
      ;;
  esac
done
# Verify arguments
# Both a database location and at least one data file are mandatory
if [ -z "$LOC" ]; then
  abort 1 "Required database location not specified"
fi
if [ $# -eq 0 ]; then
  abort 1 "No data files specified, one/more data files must be specified"
fi

# Make LOC absolute
ABS_LOC=$(makeAbsolute "$LOC")
if [ "$ABS_LOC" != "$LOC" ]; then
  LOC="$ABS_LOC"
  debug "Absolute database location is $LOC"
fi

# Make sure LOC is a valid directory
if [ ! -e "$LOC" ]; then
  # If non-existent try to create
  debug "Trying to create new database directory: $LOC"
  # Test mkdir directly in the condition: with "set -e" active a failing
  # mkdir would end the script before a separate "$?" check could run, so
  # the failure message would never be shown
  if ! mkdir -- "$LOC"; then
    abort 1 "Failed to create new directory: $LOC"
  fi
  debug "New database directory created: $LOC"
fi
if [ ! -d "$LOC" ]; then
  abort 1 "Database location is not a directory: $LOC"
fi

# Look for any index and data files in the directory.
# Skip a possible configuration file
# Only regular files directly inside LOC are considered, and the search
# stops at the first one found
if test -n "$(find "$LOC" -maxdepth 1 -type f ! -name 'this.*' -print -quit)"
then
  abort 1 "Database location is not empty: $LOC"
fi
# Prepare JVM Arguments
# Use the default heap setting when JVM_ARGS is unset or empty
: "${JVM_ARGS:=-Xmx1200M}"
debug "JVM Arguments are $JVM_ARGS"

# Classpath set in "tdbloader2"
# Nothing can be run without it, so stop here when it is missing
[[ -n "$JENA_CP" ]] || abort 1 "Classpath not provided : set JENA_CP"
# ---- Data loading phase
info "Data Load Phase"

# Prepare Files
# Every remaining positional argument is a data file; each one is stored
# under the path returned by makeAbsolute
FILES=()
while [ $# -gt 0 ]; do
  FILE=$1
  shift
  ABS_FILE=$(makeAbsolute "$FILE")
  if [ "$FILE" != "$ABS_FILE" ]; then
    # Relative path was resolved
    debug "Relative data file $FILE was resolved to absolute data file $ABS_FILE"
  fi
  FILES+=("$ABS_FILE")
done

info "Got ${#FILES[@]} data files to load"
F=1
# The array expansion is quoted so a file name containing spaces or glob
# characters is listed as a single entry
for file in "${FILES[@]}"; do
  info "Data file $F: $file"
  F=$((F + 1))
done

# Produce nodes file and triples/quads text file.
DATA_TRIPLES="$LOC/data-triples.tmp"
DATA_QUADS="$LOC/data-quads.tmp"
debug "Triples text files is $DATA_TRIPLES"
debug "Quads text file is $DATA_QUADS"

# JVM_ARGS is deliberately left unquoted: it may hold several JVM options
# which have to be passed to java as separate words
java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdNodeTableBuilder \
  "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" -- "${FILES[@]}"

info "Data Load Phase Completed"