#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
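# The indexing tool jar must be built beforehand (assumption: `mvn package`
# in the entityhub.indexing.dbpedia module produces the
# *-jar-with-dependencies.jar via the Maven assembly plugin).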
INDEXING_JAR=$(pwd)/target/org.apache.stanbol.entityhub.indexing.dbpedia-*-jar-with-dependencies.jar
WORKSPACE=/tmp/dbpedia-index
DBPEDIA=http://downloads.dbpedia.org/3.7
MAX_SORT_MEM=2G
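# Maximum main-memory buffer GNU sort may use (-S); raise this on machines
# with more RAM to speed up the link counting below.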
# Echo each command and exit on the first error, including a failure in any
# stage of a pipeline
set -x -e -o pipefail
# Ensure that the workspace exists
mkdir -p $WORKSPACE
# Create the folder structure under the workspace folder
cd $WORKSPACE
java -jar $INDEXING_JAR init
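# `init` creates the indexing/ folder layout used below, notably
# indexing/config/ (indexer configuration) and indexing/resources/
# (source data such as the RDF dumps and the incoming-links ranking).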
# Rank entities by popularity by counting the number of incoming links in the
# Wikipedia page-link graph; computing this takes around 2 hours.
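# page_links_en.nt holds one triple per wiki link, e.g.
#   <http://dbpedia.org/resource/A> <p> <http://dbpedia.org/resource/B> .
# The greedy sed below keeps only the object name (B, the link target), so
# sort | uniq -c | sort -nr yields "<count> <entity>" lines ordered by the
# number of incoming links.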
if [ ! -f $WORKSPACE/indexing/resources/incoming_links.txt ]
then
curl $DBPEDIA/en/page_links_en.nt.bz2 \
| bzcat \
| sed -e 's/.*<http\:\/\/dbpedia\.org\/resource\/\([^>]*\)> ./\1/' \
| sort -S $MAX_SORT_MEM \
| uniq -c \
| sort -nr -S $MAX_SORT_MEM > $WORKSPACE/indexing/resources/incoming_links.txt
fi
# Download the RDF dumps:
cd $WORKSPACE/indexing/resources/rdfdata
# General attributes for all entities
wget -c $DBPEDIA/dbpedia_3.7.owl.bz2
wget -c $DBPEDIA/en/instance_types_en.nt.bz2
wget -c $DBPEDIA/ar/labels_ar.nt.bz2
wget -c $DBPEDIA/en/labels_en.nt.bz2
wget -c $DBPEDIA/es/labels_es.nt.bz2
wget -c $DBPEDIA/fr/labels_fr.nt.bz2
wget -c $DBPEDIA/he/labels_he.nt.bz2
wget -c $DBPEDIA/it/labels_it.nt.bz2
wget -c $DBPEDIA/ja/labels_ja.nt.bz2
wget -c $DBPEDIA/ru/labels_ru.nt.bz2
wget -c $DBPEDIA/tr/labels_tr.nt.bz2
wget -c $DBPEDIA/zh/labels_zh.nt.bz2
wget -c $DBPEDIA/en/short_abstracts_en.nt.bz2
#wget -c $DBPEDIA/en/long_abstracts_en.nt.bz2
# Special handling for the images file, which has 5 corrupted entries
# (broken backslash escaping)
if [ ! -f images_en.nt ]
then
wget -c $DBPEDIA/en/images_en.nt.bz2
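# The sed below repairs the broken backslash escaping: the first expression
# rewrites each escaped "\\" pair as two \u005c escapes, the second turns any
# remaining lone backslash (unless it starts a \u escape or precedes a quote)
# into \u005c so that N-Triples parsers accept the file.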
bzcat images_en.nt.bz2 \
| sed 's/\\\\/\\u005c\\u005c/g;s/\\\([^u"]\)/\\u005c\1/g' > images_en.nt.tmp
# Move into place only on success so an aborted run is redone, not skipped
mv images_en.nt.tmp images_en.nt
rm -f images_en.nt.bz2
fi
# The same problem affects the German labels
if [ ! -f labels_de.nt ]
then
wget -c $DBPEDIA/de/labels_de.nt.bz2
bzcat labels_de.nt.bz2 \
| sed 's/\\\\/\\u005c\\u005c/g;s/\\\([^u"]\)/\\u005c\1/g' > labels_de.nt.tmp
# Move into place only on success so an aborted run is redone, not skipped
mv labels_de.nt.tmp labels_de.nt
rm -f labels_de.nt.bz2
fi
# Type specific attributes
wget -c $DBPEDIA/en/geo_coordinates_en.nt.bz2
wget -c $DBPEDIA/en/persondata_en.nt.bz2
# Category information
#wget -c $DBPEDIA/en/category_labels_en.nt.bz2
#wget -c $DBPEDIA/en/skos_categories_en.nt.bz2
#wget -c $DBPEDIA/en/article_categories_en.nt.bz2
# Redirects
wget -c $DBPEDIA/en/redirects_en.nt.bz2
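# Optional sanity check (an addition; uses the same bzip2 suite as bzcat):
# test the remaining compressed dumps for corruption. With set -e still
# active, the script aborts on the first broken archive.
for f in *.bz2
do
bzip2 -t "$f"
done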
set +xe
# Instruction to launch the indexing
echo "Preparation & data fetch done: edit config in $WORKSPACE/indexing/config/"
echo "Then launch indexing command:"
echo "(cd $WORKSPACE && java -jar $INDEXING_JAR index)"