#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
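# The indexing tool jar must be built beforehand (assumption: `mvn package`
# in the entityhub.indexing.dbpedia module produces the
# *-jar-with-dependencies.jar via the Maven assembly plugin).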
INDEXING_JAR=$(pwd)/target/org.apache.stanbol.entityhub.indexing.dbpedia-*-jar-with-dependencies.jar
WORKSPACE=/tmp/dbpedia-index
DBPEDIA=http://downloads.dbpedia.org/3.7
MAX_SORT_MEM=2G
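# Maximum main-memory buffer GNU sort may use (-S); raise this on machines
# with more RAM to speed up the link counting below.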
# Echo each command and exit on the first error, including a failure in any
# stage of a pipeline
set -x -e -o pipefail
# Ensure that the workspace exists
mkdir -p $WORKSPACE
# Create the folder structure under the workspace folder
cd $WORKSPACE
java -jar $INDEXING_JAR init
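# `init` creates the indexing/ folder layout used below, notably
# indexing/config/ (indexer configuration) and indexing/resources/
# (source data such as the RDF dumps and the incoming-links ranking).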
# Rank entities by popularity by counting the number of incoming links in the
# Wikipedia page-link graph; computing this takes around 2 hours.
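# page_links_en.nt holds one triple per wiki link, e.g.
#   <http://dbpedia.org/resource/A> <p> <http://dbpedia.org/resource/B> .
# The greedy sed below keeps only the object name (B, the link target), so
# sort | uniq -c | sort -nr yields "<count> <entity>" lines ordered by the
# number of incoming links.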
if [ ! -f $WORKSPACE/indexing/resources/incoming_links.txt ]
then
curl $DBPEDIA/en/page_links_en.nt.bz2 \
| bzcat \
| sed -e 's/.*<http\:\/\/dbpedia\.org\/resource\/\([^>]*\)> ./\1/' \
| sort -S $MAX_SORT_MEM \
| uniq -c \
| sort -nr -S $MAX_SORT_MEM > $WORKSPACE/indexing/resources/incoming_links.txt
fi
# Download the RDF dumps:
cd $WORKSPACE/indexing/resources/rdfdata
# General attributes for all entities
wget -c $DBPEDIA/dbpedia_3.7.owl.bz2
wget -c $DBPEDIA/en/instance_types_en.nt.bz2
wget -c $DBPEDIA/ar/labels_ar.nt.bz2
wget -c $DBPEDIA/en/labels_en.nt.bz2
wget -c $DBPEDIA/es/labels_es.nt.bz2
wget -c $DBPEDIA/fr/labels_fr.nt.bz2
wget -c $DBPEDIA/he/labels_he.nt.bz2
wget -c $DBPEDIA/it/labels_it.nt.bz2
wget -c $DBPEDIA/ja/labels_ja.nt.bz2
wget -c $DBPEDIA/ru/labels_ru.nt.bz2
wget -c $DBPEDIA/tr/labels_tr.nt.bz2
wget -c $DBPEDIA/zh/labels_zh.nt.bz2
wget -c $DBPEDIA/en/short_abstracts_en.nt.bz2
#wget -c $DBPEDIA/en/long_abstracts_en.nt.bz2
# Special handling for the images file, which has 5 corrupted entries
# (broken backslash escaping)
if [ ! -f images_en.nt ]
then
wget -c $DBPEDIA/en/images_en.nt.bz2
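# The sed below repairs the broken backslash escaping: the first expression
# rewrites each escaped "\\" pair as two \u005c escapes, the second turns any
# remaining lone backslash (unless it starts a \u escape or precedes a quote)
# into \u005c so that N-Triples parsers accept the file.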
bzcat images_en.nt.bz2 \
| sed 's/\\\\/\\u005c\\u005c/g;s/\\\([^u"]\)/\\u005c\1/g' > images_en.nt.tmp
# Move into place only on success so an aborted run is redone, not skipped
mv images_en.nt.tmp images_en.nt
rm -f images_en.nt.bz2
fi
# The same problem affects the German labels
if [ ! -f labels_de.nt ]
then
wget -c $DBPEDIA/de/labels_de.nt.bz2
bzcat labels_de.nt.bz2 \
| sed 's/\\\\/\\u005c\\u005c/g;s/\\\([^u"]\)/\\u005c\1/g' > labels_de.nt.tmp
# Move into place only on success so an aborted run is redone, not skipped
mv labels_de.nt.tmp labels_de.nt
rm -f labels_de.nt.bz2
fi
# Type specific attributes
wget -c $DBPEDIA/en/geo_coordinates_en.nt.bz2
wget -c $DBPEDIA/en/persondata_en.nt.bz2
# Category information
#wget -c $DBPEDIA/en/category_labels_en.nt.bz2
#wget -c $DBPEDIA/en/skos_categories_en.nt.bz2
#wget -c $DBPEDIA/en/article_categories_en.nt.bz2
# Redirects
wget -c $DBPEDIA/en/redirects_en.nt.bz2
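# Optional sanity check (an addition; uses the same bzip2 suite as bzcat):
# test the remaining compressed dumps for corruption. With set -e still
# active, the script aborts on the first broken archive.
for f in *.bz2
do
bzip2 -t "$f"
done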
set +xe
# Instruction to launch the indexing
echo "Preparation & data fetch done: edit config in $WORKSPACE/indexing/config/"
echo "Then launch indexing command:"
echo "(cd $WORKSPACE && java -jar $INDEXING_JAR index)"