blob: bd108a95ea4fa9c128f84646bc1d345f7f29d8c8 [file] [log] [blame]
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Parses {1} - an XML dump of a wiki as produced by the http://download.wikimedia.org/ services - and produces
# a single .txt file for each article entry, saving them in folder {2}.
# Each .txt file is named after the article title, with a few characters replaced. An additional .uri file is produced which contains the original wiki URI, built from $wprefix.
# Eg
# Article_Name.txt
# Article_Name.uri
#
# Skips articles not in the 'main' namespace
#
#
# Reads the next chunk of XML from stdin, up to (but not including) the next
# '<', and splits it on its first '>'.
# Globals written:
#   E - the text before the first '>' (element name plus any attributes)
#   C - the rest of the chunk (the content that follows the tag)
# Returns: the status of `read`, i.e. non-zero once input ends without a
#          further '<', which is what terminates a `while rdom` loop.
# IFS is changed only locally. -r keeps backslashes in the data verbatim
# instead of letting `read` treat them as escape characters and drop them.
rdom () { local IFS=\> ; read -r -d \< E C ;}
# Main script body.
#   $1 - path of the XML dump to parse (required)
#   $2 - output folder (optional, defaults to the current directory)
# For every article whose title contains no ':' it appends the title and text
# to <outdir>/<name>.txt and the article URI to <outdir>/<name>.uri.
infile=${1:-}
if [[ -z $infile ]]; then
  printf 'Usage: %s <wiki-dump.xml> [output-folder]\n' "${0##*/}" >&2
  exit 2
fi
if [[ ! -r $infile ]]; then
  printf 'Cannot read input file: %s\n' "$infile" >&2
  exit 1
fi

outdir=${2:-./}
# Resolve to an absolute path. Abort if the folder is not accessible rather
# than silently writing into the current directory.
if ! outdir=$(cd -- "$outdir" && pwd); then
  printf 'Cannot access output folder: %s\n' "${2:-./}" >&2
  exit 1
fi

wprefix=http://en.wikinews.org/wiki/
# Characters that are replaced by '_' in generated file names
# (space, slash, colon, backslash). Used as a glob pattern below.
unsafe_chars='[ /:\\]'
wtitle=""
wtext=""
wuri=""
while rdom; do
  if [[ $E == title ]]; then
    wtitle=$C
  fi
  # Match <text ...attributes...> as well as a bare <text>.
  if [[ $E == text || $E == text\ * ]]; then
    wtext=$C
  fi
  # An article is complete once both its title and its text have been seen.
  if [[ -n $wtitle && -n $wtext ]]; then
    # Ignore all titles containing ':', treated as a namespace prefix, i.e.
    # not a content article. NOTE: this also skips main-namespace articles
    # whose title merely contains a colon.
    if [[ $wtitle == *:* ]]; then
      printf 'Skipped %s\n' "$wtitle"
    else
      # Parameter expansion instead of `echo | sed`: no word splitting or
      # globbing of the title, and no processes forked per article.
      title=${wtitle// /_}
      fname=${wtitle//$unsafe_chars/_}
      wuri="$wprefix$title"
      furi=$outdir/$fname.uri
      fname=$outdir/$fname.txt
      printf 'Parsed: %s\n' "$wuri"
      # Appending (>>) keeps earlier content when two titles map to the
      # same file name.
      printf '%s\n%s\n' "$wtitle" "$wtext" >>"$fname"
      printf '%s\n' "$wuri" >>"$furi"
    fi
    wtitle=""
    wtext=""
    wuri=""
  fi
done < "$infile"