| #!/usr/bin/env bash |
| |
| ## Licensed to the Apache Software Foundation (ASF) under one |
| ## or more contributor license agreements. See the NOTICE file |
| ## distributed with this work for additional information |
| ## regarding copyright ownership. The ASF licenses this file |
| ## to you under the Apache License, Version 2.0 (the |
| ## "License"); you may not use this file except in compliance |
| ## with the License. You may obtain a copy of the License at |
| ## |
| ## http://www.apache.org/licenses/LICENSE-2.0 |
| ## |
| ## Unless required by applicable law or agreed to in writing, software |
| ## distributed under the License is distributed on an "AS IS" BASIS, |
| ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ## See the License for the specific language governing permissions and |
| ## limitations under the License. |
| |
# The environment for this sub-script is set up by "tdbloader2"
| |
# Resolves a symbolic link to the path it ultimately points at.
#
# Arguments:
#   $1 - path to resolve
# Outputs:
#   Writes the resolved path to stdout. A path that is not a symbolic
#   link is written back unchanged.
# Returns:
#   0 on success, 1 if the BSD style manual resolution gives up because
#   the chain of links is too long (most likely a cycle)
function resolveLink() {
  local NAME=$1
  local TARGET
  local HOPS=0

  if [ -L "$NAME" ]; then
    case "$OSTYPE" in
      darwin*|bsd*)
        # BSD style readlink behaves differently to GNU readlink
        # Have to manually follow links
        while [ -L "$NAME" ]; do
          HOPS=$((HOPS + 1))
          if [ "$HOPS" -gt 40 ]; then
            # Guard against cyclic links which would otherwise loop forever
            echo "Too many levels of symbolic links resolving $1" >&2
            return 1
          fi

          TARGET=$(readlink -- "$NAME")
          case "$TARGET" in
            /*)
              # Absolute target replaces the path entirely
              NAME=$TARGET
              ;;
            *)
              # A relative target is relative to the directory that
              # holds the link, not to the current working directory
              case "$NAME" in
                */*)
                  NAME="${NAME%/*}/$TARGET"
                  ;;
                *)
                  # Link lives in the current directory
                  NAME=$TARGET
                  ;;
              esac
              ;;
          esac
        done
        ;;
      *)
        # Assuming standard GNU readlink with -f for
        # canonicalize
        NAME=$(readlink -f -- "$NAME")
        ;;
    esac
  fi

  printf '%s\n' "$NAME"
}
| |
# Pull in common functions
#
# JENA_HOME must be set by the caller. If it is a symbolic link it is
# resolved to an absolute path and re-exported before the common
# functions script is sourced from its bin directory.
if [ -z "$JENA_HOME" ]; then
  echo "JENA_HOME is not set" >&2
  exit 1
fi
# If JENA_HOME is a symbolic link need to resolve
if [ -L "${JENA_HOME}" ]; then
  # Anchor a relative JENA_HOME to the current directory first so that
  # the directory holding the link is known as an absolute path
  case "$JENA_HOME" in
    /*)
      # Already absolute
      ;;
    *)
      JENA_HOME="$PWD/$JENA_HOME"
      ;;
  esac
  JENA_HOME_LINK_DIR=$(dirname "$JENA_HOME")

  JENA_HOME_RESOLVED=$(resolveLink "$JENA_HOME")
  if [ -z "$JENA_HOME_RESOLVED" ]; then
    echo "Unable to resolve symbolic links for JENA_HOME $JENA_HOME" >&2
    exit 1
  fi

  # If link is relative
  case "$JENA_HOME_RESOLVED" in
    /*)
      # Already absolute
      JENA_HOME=$JENA_HOME_RESOLVED
      ;;
    *)
      # Relative, make absolute. A relative link target is relative to
      # the directory that holds the link
      JENA_HOME="$JENA_HOME_LINK_DIR/$JENA_HOME_RESOLVED"
      ;;
  esac
  export JENA_HOME
  echo "Resolved symbolic links for JENA_HOME to $JENA_HOME"
fi

if [ -e "${JENA_HOME}/bin/tdbloader2common" ]; then
  # Can source common functions
  source "${JENA_HOME}/bin/tdbloader2common"
else
  echo "Unable to locate common functions script tdbloader2common" >&2
  exit 1
fi
| |
# Prints the command line help for tdbloader2index to stdout.
#
# The here-document delimiter is quoted so the help text is emitted
# literally, with no parameter or command expansion.
function printUsage() {
  cat << 'EOF'
tdbloader2index - TDB Bulk Loader - Index Phase

Usage: tdbloader2index --loc <Directory> [Options]

Bulk Loader for TDB which generates the Index files based upon the
temporary data files generated by tdbloader2data. This command relies
on POSIX utilities so will only work on POSIX operating systems.

This command can only be used to create new database. If you wish to
bulk load to an existing database please use tdbloader instead.

Required options are as follows:

  -l <DatabaseDirectory>
  --loc <DatabaseDirectory>
    Sets the location in which the database should be created.

    This location must be an existing directory which contains the
    data-triples.tmp and data-quads.tmp files generated by
    tdbloader2data.

Common additional options are as follows:

  -h
  --help
    Prints this help summary and exits

Advanced additional options are as follows:

  -d
  --debug
    Enable debug mode, adds extra debug output

  -j <JvmArgs>
  --jvm-args <JvmArgs>
    Sets the arguments that should be passed to the JVM for the
    JVM based portions of the build.

    Generally it is best to not change these unless you have been
    specifically advised to. The scripts will use appropriate
    defaults if this is not specified.

    In particular be careful increasing the heap size since many
    parts of TDB actually use memory mapped files that live
    outside the heap so if the heap is too large the heap may
    conflict with the memory mapped files for memory space.

  -k
  --keep-work
    Keeps the temporary work files around after they are no longer
    needed. May be useful for debugging.

  -s <SortArgs>
  --sort-args <SortArgs>
    Sets the arguments that should be passed to sort for the sort
    based portions of the build.

    Generally it is best not to change these as the scripts will
    use appropriate defaults for your system.

  -t
  --trace
    Enable trace mode, essentially sets -x within the scripts
EOF
}
| |
# Exit on error.
set -e

# Sort order is ASCII
export LC_ALL="C"

# Process Arguments
#
# Results are left in the following variables:
#   LOC       - database location
#   KEEP_WORK - 1 if work files should be kept, 0 otherwise
#   DEBUG     - 1 if debug mode was requested, 0 otherwise
#   JVM_ARGS  - arguments for the JVM, empty if not given
#   SORT_ARGS - arguments for sort, empty if not given
LOC=
KEEP_WORK=0
DEBUG=0
JVM_ARGS=
SORT_ARGS=

while [ $# -gt 0 ]
do
  ARG=$1
  case "$ARG" in
    -d|--debug)
      # Debug Mode
      shift
      DEBUG=1
      ;;
    -h|--help)
      printUsage
      exit 0
      ;;
    -j|--jvm-args)
      # JVM Arguments
      # Without this check a missing value makes the second shift fail
      # and set -e then ends the script without any message
      if [ $# -lt 2 ]; then
        abort 1 "Option $ARG requires an argument"
      fi
      JVM_ARGS="$2"
      shift 2
      ;;
    -k|--keep-work)
      # Keep work files
      shift
      KEEP_WORK=1
      ;;
    -l|--loc|-loc)
      # Location space separated
      if [ $# -lt 2 ]; then
        abort 1 "Option $ARG requires an argument"
      fi
      LOC="$2"
      shift 2
      ;;
    -*loc=*)
      # Location = separated
      # Strip only up to the first = so that a location which itself
      # contains "loc=" is kept intact
      LOC=${ARG#*=}
      shift
      ;;
    -s|--sort-args)
      # Sort arguments
      if [ $# -lt 2 ]; then
        abort 1 "Option $ARG requires an argument"
      fi
      SORT_ARGS=$2
      shift 2
      ;;
    -t|--trace)
      # Trace mode
      shift
      set -x
      ;;
    *)
      # Additional options are not supported
      abort 1 "Unrecognized option $ARG"
      ;;
  esac
done
| |
# Verify arguments
# A database location is mandatory
[[ -n "$LOC" ]] || abort 1 "Required database location not specified"

# Make LOC absolute
# Only switch over (and report it) when the absolute form differs
ABS_LOC=$(makeAbsolute "$LOC")
if [[ "$LOC" != "$ABS_LOC" ]]; then
  LOC=$ABS_LOC
  debug "Absolute database location is $LOC"
fi

# Check location
# It has to exist already and it has to be a directory
[[ -e "$LOC" ]] || abort 1 "Database location specified does not exist: $LOC"
[[ -d "$LOC" ]] || abort 1 "Database location is not a directory: $LOC"

# Locate and check data text files
# Both files are expected to have been left in the database location
# by the tdbloader2data script
DATA_TRIPLES="${LOC}/data-triples.tmp"
DATA_QUADS="${LOC}/data-quads.tmp"

[[ -e "$DATA_TRIPLES" ]] || abort 1 "No triples text file found in database location, please run the tdbloader2data script first"
[[ -e "$DATA_QUADS" ]] || abort 1 "No quads text file found in database location, please run the tdbloader2data script first"

debug "Data text files are $DATA_TRIPLES and $DATA_QUADS"
| |
# Prepare sort arguments
# Defaults are only computed when the user gave no sort arguments
if [[ -z "$SORT_ARGS" ]]; then
  SORT_ARGS="--buffer-size=50%"

  # --parallel is not always available.
  # Probing inside the if condition keeps a failed probe from
  # triggering exit on error
  if sort --parallel=3 < /dev/null 2>/dev/null; then
    SORT_ARGS+=" --parallel=3"
  fi
fi

# Prepare JVM arguments
# Fall back to a default heap size when none were given
: "${JVM_ARGS:=-Xmx1200M}"
debug "JVM Arguments are $JVM_ARGS"

# Classpath set in "tdbloader2"
[[ -n "$JENA_CP" ]] || abort 1 "Classpath not provided : set JENA_CP"
debug "Jena Classpath is $JENA_CP"
| |
# ---- Index intermediates
# All files are written S P O / G S P O columns per row but in different sort orders.
info "Index Building Phase"

# Check where we are storing temporary sort files
#
# SORT_TEMP_DIR ends up as the directory named in the sort arguments,
# or else the value of TMPDIR, and is left empty if neither is
# available.
debug "Sort Arguments: $SORT_ARGS"
SORT_TEMP_DIR=
if [[ "$SORT_ARGS" == *"-T "* ]]; then
  # Specified via -T argument
  # The directory is the first word that follows the option. Cutting
  # everything up to and including the option keeps this correct when
  # other sort arguments come before it
  read -r SORT_TEMP_DIR _ <<< "${SORT_ARGS#*-T }"
elif [[ "$SORT_ARGS" == *"--temporary-directory="* ]]; then
  # Specified via --temporary-directory argument
  read -r SORT_TEMP_DIR _ <<< "${SORT_ARGS#*--temporary-directory=}"
else
  # Using the system temp directory
  SORT_TEMP_DIR="$TMPDIR"
fi
if [ -n "$SORT_TEMP_DIR" ]; then
  # If we've figured out the sort temporary directory then check it
  SORT_TEMP_DIR=$(makeAbsolute "$SORT_TEMP_DIR")
  debug "Sort Temp Directory: $SORT_TEMP_DIR"
  # As used in the messages below, element 0 names the disk, element 2
  # is the percentage of free space and element 3 the free bytes
  SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
  if [ "${#SORT_DRIVE_INFO[@]}" -gt 0 ]; then
    debug "Sort Temp Directory is on disk ${SORT_DRIVE_INFO[0]} which has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes)"

    if [ "${SORT_DRIVE_INFO[2]}" -le 10 ]; then
      warn "-----"
      warn "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_INFO[0]} which only has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes) available"
      warn "This may result in sort failures if the data to be indexed is large"
      warn "-----"
    fi
  fi
fi
| |
# Builds a single index from one of the data text files.
#
# The data file is sorted with duplicates removed into a work file in
# the database location, the work file is handed to the Java index
# builder and is then removed unless work files are being kept.
#
# Globals read:
#   LOC, SORT_ARGS, SORT_TEMP_DIR, JVM_ARGS, JENA_CP, PKG, KEEP_WORK
# Arguments:
#   $1 - sort key arguments, e.g. "-k 1,1 -k 2,2 -k 3,3"
#   $2 - data text file to index
#   $3 - name of the index, e.g. SPO
# Returns:
#   Returns without doing anything when the data file is missing or
#   empty.
generate_index()
{
  local KEYS="$1"
  local DATA="$2"
  local IDX=$3
  local WORK="$LOC/$IDX-txt"
  local FREE_MEM
  local TEMP_DRIVE_INFO

  if [ ! -s "$DATA" ]; then
    debug "Skipping Index $IDX as no relevant data to index"
    return
  fi

  info "Creating Index $IDX"

  # For various purposes we need to know the size of the input data
  # NOTE(review): declaring and assigning in one statement hides the
  # exit status of getSize, kept as is because that helper is not
  # visible from this script
  local SIZE=$(getSize "$DATA")
  debug "Size of data to be sorted is $SIZE bytes"

  # Verify that we have enough space to sort the data

  # Firstly check that the output disk has sufficient space
  # As used below, element 0 names the drive and element 3 is the
  # number of free bytes
  local WORK_DRIVE_INFO=($(getDriveInfo "$LOC"))
  if [ "${#WORK_DRIVE_INFO[@]}" -gt 0 ]; then
    if [ "${SIZE}" -ge "${WORK_DRIVE_INFO[3]}" ]; then
      # If there is insufficient disk space then we can abort now
      abort 1 "Insufficient free space on database drive ${WORK_DRIVE_INFO[0]}, there are ${WORK_DRIVE_INFO[3]} bytes free but ${SIZE} bytes are required"
    else
      debug "Sufficient free space on database drive ${WORK_DRIVE_INFO[0]} to attempt sorting data file ${DATA} (${SIZE} bytes required from ${WORK_DRIVE_INFO[3]} bytes free)"
    fi
  fi

  # Secondly check if there is enough space to sort in-memory or if sort may need to do an external sort
  # We only issue warnings when the sort is likely to be external because there are various factors
  # such as virtual memory and OS file caching that may complicate this
  FREE_MEM=$(getFreeMem)
  if [ "$FREE_MEM" -ge 0 ]; then
    if [ "$SIZE" -ge "$FREE_MEM" ]; then
      debug "Insufficient free memory to sort data in-memory, sort will need to perform an external sort using Temp Directory ${SORT_TEMP_DIR}"

      # Check for disk space on temporary disk
      if [ -n "${SORT_TEMP_DIR}" ]; then
        TEMP_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
        if [ "${#TEMP_DRIVE_INFO[@]}" -gt 0 ]; then
          if [ "$SIZE" -ge "${TEMP_DRIVE_INFO[3]}" ]; then
            warn "There may be insufficient space for sort to perform an external sort using Temp Directory ${SORT_TEMP_DIR} (${SIZE} bytes required but only ${TEMP_DRIVE_INFO[3]} bytes free)"
          fi
        fi
      fi
    else
      debug "Should be sufficient free memory ($FREE_MEM bytes) for sort to be fully in-memory"
    fi
  else
    debug "Unable to determine free memory on your OS, can't check whether sort will be in-memory or external sort using Temp Directory ${SORT_TEMP_DIR}"
  fi

  # Sort the input data
  info "Sort $IDX"
  debug "Sorting $DATA into work file $WORK"
  # SORT_ARGS and KEYS each hold several arguments so are deliberately
  # left unquoted. The work file must be quoted or the redirect fails
  # when the database location contains whitespace
  # shellcheck disable=SC2086
  sort $SORT_ARGS -u $KEYS < "$DATA" > "$WORK"
  info "Sort $IDX Completed"

  # Build into an index
  info "Build $IDX"
  rm -f "$LOC/$IDX.dat"
  rm -f "$LOC/$IDX.idn"
  # JVM_ARGS may hold several arguments so is deliberately unquoted
  # shellcheck disable=SC2086
  java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
  info "Build $IDX Completed"

  # Remove work file unless keeping
  if [ "$KEEP_WORK" = 0 ]; then
    debug "Cleaning up work file $WORK"
    rm "$WORK"
  fi
}
| |
# Sort key arguments for each of the four columns
K1="-k 1,1"
K2="-k 2,2"
K3="-k 3,3"
K4="-k 4,4"

# Each entry is "<sort keys>|<index name>", the keys never contain a |
# so the two halves can be split apart again with parameter expansion

# Indexes built from the triples data
for INDEX_SPEC in \
  "$K1 $K2 $K3|SPO" \
  "$K2 $K3 $K1|POS" \
  "$K3 $K1 $K2|OSP"
do
  generate_index "${INDEX_SPEC%|*}" "$DATA_TRIPLES" "${INDEX_SPEC#*|}"
done

# Indexes built from the quads data
for INDEX_SPEC in \
  "$K1 $K2 $K3 $K4|GSPO" \
  "$K1 $K3 $K4 $K2|GPOS" \
  "$K1 $K4 $K2 $K3|GOSP" \
  "$K2 $K3 $K4 $K1|SPOG" \
  "$K3 $K4 $K2 $K1|POSG" \
  "$K4 $K2 $K3 $K1|OSPG"
do
  generate_index "${INDEX_SPEC%|*}" "$DATA_QUADS" "${INDEX_SPEC#*|}"
done

info "Index Building Phase Completed"

# ---- Clean up.
# The data text files are only retained when keeping work files
if [[ "$KEEP_WORK" == 0 ]]; then
  debug "Cleaning up data files $DATA_TRIPLES and $DATA_QUADS"
  rm -f "$DATA_TRIPLES" "$DATA_QUADS"
fi