// lucene/benchmark/build.gradle - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Plugins applied to this project. The script below relies on `sourceSets`
 // and on the `Download` task type.
 // NOTE(review): no plugin version is given for the download plugin here;
 // presumably it is declared elsewhere in the build - confirm.
 plugins {
   id 'java'
   id 'de.undercouch.download'
 }

 // Human-readable summary of this project.
 project.description = 'System for benchmarking Lucene'

 // Dependencies of the benchmark module.
 dependencies {
   // Lucene modules the benchmark code is compiled against.
   implementation(project(':lucene:core'))

   implementation(project(':lucene:analysis:common'))
   implementation(project(':lucene:facet'))
   implementation(project(':lucene:highlighter'))
   implementation(project(':lucene:queries'))
   implementation(project(':lucene:spatial-extras'))
   implementation(project(':lucene:queryparser'))

   // External libraries. The coordinates carry no version;
   // presumably versions are managed elsewhere in the build - TODO confirm.
   implementation('org.apache.commons:commons-compress')
   implementation('com.ibm.icu:icu4j')
   implementation('org.locationtech.spatial4j:spatial4j')
   implementation('net.sourceforge.nekohtml:nekohtml') {
     // Leave the transitive xml-apis module off the classpath.
     exclude module: 'xml-apis'
   }

   // Only needed when running, not when compiling.
   runtimeOnly(project(':lucene:analysis:icu'))

   testImplementation(project(':lucene:test-framework'))
 }

 // Scratch directories used by the data tasks below: downloaded archives (and
 // bunzip2 output) go under temp/, extracted data sets go under work/.
 def tempDir = file('temp')
 def workDir = file('work')

 // Runs the benchmark driver (org.apache.lucene.benchmark.byTask.Benchmark) in a
 // forked JVM using this project's runtime classpath.
 //
 // Optional project properties (pass with -P on the command line):
 //   taskAlg        - algorithm file passed to the driver as its only argument
 //                    (default: conf/micro-standard.alg)
 //   maxHeapSize    - maximum heap size of the forked JVM (default: 1G)
 //   standardOutput - file that receives the forked JVM's standard output;
 //                    relative paths are resolved against this project's directory
 task run(type: JavaExec) {
   description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
   main 'org.apache.lucene.benchmark.byTask.Benchmark'
   classpath sourceSets.main.runtimeClasspath
   // allow these to be specified on the CLI via -PtaskAlg=  for example
   args = [propertyOrDefault('taskAlg', 'conf/micro-standard.alg')]

   maxHeapSize = propertyOrDefault('maxHeapSize', '1G')

   String stdOutStr = propertyOrDefault('standardOutput', null)
   if (stdOutStr != null) {
     // project.file() anchors relative paths at the project directory; a bare
     // new File(...) would depend on the working directory of the Gradle JVM.
     File stdOutFile = project.file(stdOutStr)
     OutputStream redirectedOut = null

     // Open the file only when the task really executes. Opening it while the
     // build is being configured would create/truncate the file on every Gradle
     // invocation that carries -PstandardOutput, even if 'run' is never executed.
     doFirst {
       stdOutFile.parentFile?.mkdirs()
       redirectedOut = stdOutFile.newOutputStream()
       standardOutput = redirectedOut
     }

     // Release the file handle once the benchmark has finished.
     // NOTE(review): doLast actions do not run when the benchmark itself fails;
     // in that case the stream is left open, as it was before this change.
     doLast {
       redirectedOut?.close()
     }
   }

   // Remote debugging is off by default. When enabled, the forked JVM is
   // configured with port 5005 and suspend = true.
   debugOptions {
     enabled = false
     port = 5005
     suspend = true
   }
 }

 /* Old "collation" Ant target:
 gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
 perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
  */

 /* Old "shingle" Ant target:
 gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
 perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
  */

 // The remaining tasks just get / extract / prepare data

 // Downloads the bzip2-compressed English Wikipedia dump into tempDir and
 // unpacks it there.
 task getEnWiki(type: Download) {
   def dumpName = 'enwiki-20070527-pages-articles.xml'
   def archiveName = dumpName + '.bz2'

   src 'https://home.apache.org/~dsmiley/data/' + archiveName
   dest file("$tempDir/$archiveName")
   overwrite false
   compress false

   // The unpacked dump is the declared output of this task.
   outputs.file file("$tempDir/$dumpName")

   doLast {
     // Unpack the downloaded archive into tempDir.
     ant.bunzip2(src: dest, dest: tempDir)
   }
 }

 // Downloads the bzip2-compressed, randomly ordered GeoNames dump into tempDir
 // and unpacks it there.
 task getGeoNames(type: Download) {
   // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
   //       and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
   //       and then compress with: bzip2 -9 -k file_random.txt
   def dataName = 'geonames_20130921_randomOrder_allCountries.txt'
   def archiveName = dataName + '.bz2'

   src 'https://home.apache.org/~dsmiley/data/' + archiveName
   dest file("$tempDir/$archiveName")
   overwrite false
   compress false

   // The unpacked text file is the declared output of this task.
   outputs.file file("$tempDir/$dataName")

   doLast {
     // will chop off .bz2
     ant.bunzip2(src: dest, dest: tempDir)
   }
 }

 // Downloads the "top 100k words" Wikipedia archive into tempDir and syncs its
 // contents into work/top100k-out.
 task getTop100kWikiWordFiles(type: Download) {
   src 'https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2'

   // The local archive is named after the last path segment of the source URL.
   def archiveName = src.file.split('/').last()
   dest file("$tempDir/$archiveName")
   overwrite false
   compress false

   def wordsDir = file("$workDir/top100k-out")
   outputs.dir wordsDir

   doLast {
     project.sync {
       // 'dest' is the archive configured above; tarTree decompresses on the fly.
       from tarTree(dest)
       into wordsDir
     }
   }
 }

 // Downloads the Reuters-21578 collection, untars it into work/reuters and then
 // runs ExtractReuters to produce work/reuters-out.
 task getReuters(type: Download) {
   // note: there is no HTTPS url and we don't care because this is merely test/perf data
   src 'http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz'

   // The local archive is named after the last path segment of the source URL.
   def archiveName = src.file.split('/').last()
   dest file("$tempDir/$archiveName")
   overwrite false
   compress false

   def rawDir = file("$workDir/reuters")
   def extractedDir = file("$workDir/reuters-out")

   // ExtractReuters is launched from the main runtime classpath in doLast.
   dependsOn sourceSets.main.runtimeClasspath
   outputs.dir extractedDir

   doLast {
     // Step 1: unpack the archive (decompressed on the fly), skipping *.txt entries.
     project.sync {
       from(tarTree(dest)) {
         exclude '*.txt'
       }
       into rawDir
     }

     // Step 2: convert the raw files with ExtractReuters.
     println "Extracting reuters to $extractedDir"
     // necessary (presumably ExtractReuters expects the target not to exist - confirm)
     extractedDir.deleteDir()
     // TODO consider porting ExtractReuters to groovy?
     project.javaexec {
       main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
       classpath = sourceSets.main.runtimeClasspath
       maxHeapSize = '1G'
       args = [rawDir, extractedDir]
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Plugins applied to this project. The script below relies on `sourceSets`
	// and on the `Download` task type.
	// NOTE(review): no plugin version is given for the download plugin here;
	// presumably it is declared elsewhere in the build - confirm.
	plugins {
	  id 'java'
	  id 'de.undercouch.download'
	}

	// Human-readable summary of this project.
	project.description = 'System for benchmarking Lucene'

	// Dependencies of the benchmark module.
	dependencies {
	  // Lucene modules the benchmark code is compiled against.
	  implementation(project(':lucene:core'))

	  implementation(project(':lucene:analysis:common'))
	  implementation(project(':lucene:facet'))
	  implementation(project(':lucene:highlighter'))
	  implementation(project(':lucene:queries'))
	  implementation(project(':lucene:spatial-extras'))
	  implementation(project(':lucene:queryparser'))

	  // External libraries. The coordinates carry no version;
	  // presumably versions are managed elsewhere in the build - TODO confirm.
	  implementation('org.apache.commons:commons-compress')
	  implementation('com.ibm.icu:icu4j')
	  implementation('org.locationtech.spatial4j:spatial4j')
	  implementation('net.sourceforge.nekohtml:nekohtml') {
	    // Leave the transitive xml-apis module off the classpath.
	    exclude module: 'xml-apis'
	  }

	  // Only needed when running, not when compiling.
	  runtimeOnly(project(':lucene:analysis:icu'))

	  testImplementation(project(':lucene:test-framework'))
	}

	// Scratch directories used by the data tasks below: downloaded archives (and
	// bunzip2 output) go under temp/, extracted data sets go under work/.
	def tempDir = file('temp')
	def workDir = file('work')

	// Runs the benchmark driver (org.apache.lucene.benchmark.byTask.Benchmark) in a
	// forked JVM using this project's runtime classpath.
	//
	// Optional project properties (pass with -P on the command line):
	//   taskAlg        - algorithm file passed to the driver as its only argument
	//                    (default: conf/micro-standard.alg)
	//   maxHeapSize    - maximum heap size of the forked JVM (default: 1G)
	//   standardOutput - file that receives the forked JVM's standard output;
	//                    relative paths are resolved against this project's directory
	task run(type: JavaExec) {
	  description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
	  main 'org.apache.lucene.benchmark.byTask.Benchmark'
	  classpath sourceSets.main.runtimeClasspath
	  // allow these to be specified on the CLI via -PtaskAlg= for example
	  args = [propertyOrDefault('taskAlg', 'conf/micro-standard.alg')]

	  maxHeapSize = propertyOrDefault('maxHeapSize', '1G')

	  String stdOutStr = propertyOrDefault('standardOutput', null)
	  if (stdOutStr != null) {
	    // project.file() anchors relative paths at the project directory; a bare
	    // new File(...) would depend on the working directory of the Gradle JVM.
	    File stdOutFile = project.file(stdOutStr)
	    OutputStream redirectedOut = null

	    // Open the file only when the task really executes. Opening it while the
	    // build is being configured would create/truncate the file on every Gradle
	    // invocation that carries -PstandardOutput, even if 'run' is never executed.
	    doFirst {
	      stdOutFile.parentFile?.mkdirs()
	      redirectedOut = stdOutFile.newOutputStream()
	      standardOutput = redirectedOut
	    }

	    // Release the file handle once the benchmark has finished.
	    // NOTE(review): doLast actions do not run when the benchmark itself fails;
	    // in that case the stream is left open, as it was before this change.
	    doLast {
	      redirectedOut?.close()
	    }
	  }

	  // Remote debugging is off by default. When enabled, the forked JVM is
	  // configured with port 5005 and suspend = true.
	  debugOptions {
	    enabled = false
	    port = 5005
	    suspend = true
	  }
	}

	/* Old "collation" Ant target:
	gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
	perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
	*/

	/* Old "shingle" Ant target:
	gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
	perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
	*/

	// The remaining tasks just get / extract / prepare data

	// Downloads the bzip2-compressed English Wikipedia dump into tempDir and
	// unpacks it there.
	task getEnWiki(type: Download) {
	  def dumpName = 'enwiki-20070527-pages-articles.xml'
	  def archiveName = dumpName + '.bz2'

	  src 'https://home.apache.org/~dsmiley/data/' + archiveName
	  dest file("$tempDir/$archiveName")
	  overwrite false
	  compress false

	  // The unpacked dump is the declared output of this task.
	  outputs.file file("$tempDir/$dumpName")

	  doLast {
	    // Unpack the downloaded archive into tempDir.
	    ant.bunzip2(src: dest, dest: tempDir)
	  }
	}

	// Downloads the bzip2-compressed, randomly ordered GeoNames dump into tempDir
	// and unpacks it there.
	task getGeoNames(type: Download) {
	  // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
	  // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
	  // and then compress with: bzip2 -9 -k file_random.txt
	  def dataName = 'geonames_20130921_randomOrder_allCountries.txt'
	  def archiveName = dataName + '.bz2'

	  src 'https://home.apache.org/~dsmiley/data/' + archiveName
	  dest file("$tempDir/$archiveName")
	  overwrite false
	  compress false

	  // The unpacked text file is the declared output of this task.
	  outputs.file file("$tempDir/$dataName")

	  doLast {
	    // will chop off .bz2
	    ant.bunzip2(src: dest, dest: tempDir)
	  }
	}

	// Downloads the "top 100k words" Wikipedia archive into tempDir and syncs its
	// contents into work/top100k-out.
	task getTop100kWikiWordFiles(type: Download) {
	  src 'https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2'

	  // The local archive is named after the last path segment of the source URL.
	  def archiveName = src.file.split('/').last()
	  dest file("$tempDir/$archiveName")
	  overwrite false
	  compress false

	  def wordsDir = file("$workDir/top100k-out")
	  outputs.dir wordsDir

	  doLast {
	    project.sync {
	      // 'dest' is the archive configured above; tarTree decompresses on the fly.
	      from tarTree(dest)
	      into wordsDir
	    }
	  }
	}

	// Downloads the Reuters-21578 collection, untars it into work/reuters and then
	// runs ExtractReuters to produce work/reuters-out.
	task getReuters(type: Download) {
	  // note: there is no HTTPS url and we don't care because this is merely test/perf data
	  src 'http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz'

	  // The local archive is named after the last path segment of the source URL.
	  def archiveName = src.file.split('/').last()
	  dest file("$tempDir/$archiveName")
	  overwrite false
	  compress false

	  def rawDir = file("$workDir/reuters")
	  def extractedDir = file("$workDir/reuters-out")

	  // ExtractReuters is launched from the main runtime classpath in doLast.
	  dependsOn sourceSets.main.runtimeClasspath
	  outputs.dir extractedDir

	  doLast {
	    // Step 1: unpack the archive (decompressed on the fly), skipping *.txt entries.
	    project.sync {
	      from(tarTree(dest)) {
	        exclude '*.txt'
	      }
	      into rawDir
	    }

	    // Step 2: convert the raw files with ExtractReuters.
	    println "Extracting reuters to $extractedDir"
	    // necessary (presumably ExtractReuters expects the target not to exist - confirm)
	    extractedDir.deleteDir()
	    // TODO consider porting ExtractReuters to groovy?
	    project.javaexec {
	      main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
	      classpath = sourceSets.main.runtimeClasspath
	      maxHeapSize = '1G'
	      args = [rawDir, extractedDir]
	    }
	  }
	}