| import org.apache.lucene.gradle.datasets.ExtractReuters |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // TODO: not sure whether this should live in benchmarks, but for now |
| // let it be. |
configure(project(":lucene:benchmark")) {
  apply plugin: "java"
  apply plugin: "de.undercouch.download"

  ext {
    // All benchmark data sets are downloaded and unpacked under this directory.
    dataDir = file("data")
  }

  // Downloads the English Wikipedia article dump and decompresses it in place.
  task getEnWiki(type: Download) {
    ext {
      name = "enwiki-20070527-pages-articles.xml"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    // Skip the download if the compressed archive is already present.
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  // Downloads the pre-randomized Wikipedia "line docs" file and decompresses it.
  task getEnWikiRandomLines(type: Download) {
    ext {
      name = "enwiki.random.lines.txt"
      src = "https://home.apache.org/~mikemccand/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    // Skip the download if the compressed archive is already present.
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  // Downloads a randomized GeoNames dump and decompresses it in place.
  task getGeoNames(type: Download) {
    // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
    //       and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
    //       and then compress with: bzip2 -9 -k file_random.txt
    ext {
      name = "geonames_20130921_randomOrder_allCountries.txt"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    // Skip the download if the compressed archive is already present.
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  // Downloads a tar archive of top-100k word files and unpacks it into a directory.
  task getTop100kWikiWordFiles(type: Download) {
    ext {
      name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
      src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
      // Keep the full ".tar.bz2" suffix (the download really is a compressed tar,
      // consistent with getReuters) so tarTree's extension-based compression
      // detection is unambiguous.
      intermediate = file("${dataDir}/${name}.tar.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    // Skip the download if the compressed archive is already present.
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        from tarTree(ext.intermediate) // defined above. Will decompress on the fly
        into ext.dst
      }
    }
  }

  // Downloads the Reuters-21578 test collection, unpacks it and runs
  // ExtractReuters to convert the SGML files into per-document text files.
  task getReuters(type: Download) {
    ext {
      name = "reuters21578"
      // note: there is no HTTPS url and we don't care because this is merely test/perf data
      src = "http://www.daviddlewis.com/resources/testcollections/reuters21578/${name}.tar.gz"
      intermediate = file("${dataDir}/${name}.tar.gz")
      dst = file("${dataDir}/${name}")
    }

    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    // Skip the download if the compressed archive is already present.
    overwrite false
    compress false

    doLast {
      def untarPath = file("$temporaryDir/reuters-untar")

      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        // Qualified as ext.intermediate for consistency with the other tasks.
        from(tarTree(ext.intermediate)) {
          exclude '*.txt'
        }
        into untarPath
      }

      logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
      // Start from a clean output directory; ExtractReuters writes one file per document.
      ext.dst.deleteDir()
      ExtractReuters.main(untarPath.toString(), ext.dst.toString())
    }
  }

  // Aggregate task: depends on every getXyz task (wired up below).
  task downloadDatasets() {
    group "Data set download"
    description "Download all data sets."
  }

  [
    getEnWiki,
    getGeoNames,
    getTop100kWikiWordFiles,
    getReuters,
    getEnWikiRandomLines
  ].each { task ->
    task.group "Data set download"
    task.description "Download the ${task.ext.name} data set."

    downloadDatasets.dependsOn(task)

    task.doFirst {
      logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
    }
  }
}