blob: 73f1dd7283357096542ab4cf4a1d421d9b6e72bf [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Gradle plugins applied to this project.
plugins {
id "java"
// NOTE(review): this plugin id is an empty string, which cannot identify a plugin.
// The tasks further down use a `Download` task type, so presumably the plugin that
// provides it was meant to be applied here — confirm against the upstream build file.
id ""
// Human-readable description of this project.
description = 'System for benchmarking Lucene'
// Dependencies of the benchmark module. None of the external coordinates carries a
// version; presumably versions are constrained elsewhere in the build — TODO confirm.
dependencies {
// Sibling Lucene modules used by the benchmark code.
implementation project(':lucene:core')
implementation project(':lucene:analysis:common')
implementation project(':lucene:facet')
implementation project(':lucene:highlighter')
implementation project(':lucene:queries')
implementation project(':lucene:spatial-extras')
implementation project(':lucene:queryparser')
// External libraries.
implementation "org.apache.commons:commons-compress"
// NOTE(review): empty dependency notation — the coordinates appear to be missing here; confirm.
implementation ""
implementation "org.locationtech.spatial4j:spatial4j"
// nekohtml is pulled in with its "xml-apis" module excluded.
implementation("net.sourceforge.nekohtml:nekohtml", {
exclude module: "xml-apis"
// The ICU analysis module is added to the runtime classpath only.
runtimeOnly project(':lucene:analysis:icu')
testImplementation project(':lucene:test-framework')
// Directories used by the tasks below, both resolved through the project's file():
// "temp" is where downloaded archives are stored, "work" is where extracted data is placed.
File tempDir = file("temp")
File workDir = file("work")
// Runs the benchmark driver (org.apache.lucene.benchmark.byTask.Benchmark) on one algorithm file.
//
// Optional project properties, passed as -Pname=value:
//   taskAlg        - algorithm file to execute (default: conf/micro-standard.alg)
//   maxHeapSize    - maximum heap of the forked JVM (default: 1G)
//   standardOutput - file that receives the benchmark's standard output; a relative
//                    path is resolved against this project's directory
task run(type: JavaExec) {
  description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
  main 'org.apache.lucene.benchmark.byTask.Benchmark'
  classpath sourceSets.main.runtimeClasspath
  // allow these to be specified on the CLI via -PtaskAlg= for example
  args = [propertyOrDefault('taskAlg', 'conf/micro-standard.alg')]
  maxHeapSize = propertyOrDefault('maxHeapSize', '1G')

  String stdOutStr = propertyOrDefault('standardOutput', null)
  if (stdOutStr != null) {
    // file() resolves relative paths against the project directory rather than the
    // working directory of the Gradle JVM, which is what "work/..." style paths expect.
    File stdOutFile = file(stdOutStr)
    OutputStream stdOutStream = null

    // Open the file only when the task actually executes. Opening it during
    // configuration would create/truncate it on every build that merely configures
    // this project with -PstandardOutput set, even if 'run' is never executed.
    doFirst {
      stdOutFile.parentFile?.mkdirs()
      stdOutStream = stdOutFile.newOutputStream()
      standardOutput = stdOutStream
    }

    // Flush and release the file once the benchmark JVM has finished.
    // Note: doLast is skipped when the forked JVM fails, so the stream is only
    // closed explicitly on success.
    doLast {
      stdOutStream?.close()
    }
  }

  // Remote debugging of the forked JVM is off by default; flip 'enabled' to attach
  // a debugger on port 5005 (the JVM then waits for the debugger because suspend = true).
  debugOptions {
    enabled = false
    port = 5005
    suspend = true
  }
}
/* Old "collation" Ant target:
gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
perl -CSD scripts/ work/collation.benchmark.output.txt
*/
/* Old "shingle" Ant target:
gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
perl -CSD scripts/ work/shingle.benchmark.output.txt
*/
// The remaining tasks just get / extract / prepare data
// Downloads the bzip2-compressed archive "<finalName>.bz2" into tempDir and
// decompresses it there with Ant's bunzip2.
task getEnWiki(type: Download) {
def finalName = "enwiki-20070527-pages-articles.xml"
// NOTE(review): the URL prefix literal is empty, so `src` is only the archive file
// name — confirm the actual download location.
src "" + finalName + ".bz2"
dest file("$tempDir/" + finalName + ".bz2")
// presumably: an archive that is already present is kept rather than downloaded
// again — confirm against the Download task's documentation
overwrite false
compress false
doLast {
// Decompress the downloaded archive into tempDir.
ant.bunzip2(src: dest, dest: tempDir)
// Register the decompressed file (archive name without ".bz2") as a task output.
outputs.file file("$tempDir/$finalName")
// Downloads the bzip2-compressed archive "<finalName>.bz2" into tempDir and
// decompresses it there with Ant's bunzip2.
task getGeoNames(type: Download) {
// note: latest data is at:
// and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
// and then compress with: bzip2 -9 -k file_random.txt
def finalName = "geonames_20130921_randomOrder_allCountries.txt"
// NOTE(review): the URL prefix literal is empty, so `src` is only the archive file
// name — confirm the actual download location.
src "" + finalName + ".bz2"
dest file("$tempDir/" + finalName + ".bz2")
// presumably: an archive that is already present is kept rather than downloaded
// again — confirm against the Download task's documentation
overwrite false
compress false
doLast {
ant.bunzip2(src: dest, dest: tempDir) // will chop off .bz2
// Register the decompressed file as a task output.
outputs.file file("$tempDir/$finalName")
// Downloads a tar archive into tempDir and syncs its unpacked contents into
// "<workDir>/top100k-out".
task getTop100kWikiWordFiles(type: Download) {
// NOTE(review): `src` is an empty string here although the next line derives the
// local file name from it — confirm the download URL.
src ""
// The local file name is the last '/'-separated segment of `src.file`.
dest file("$tempDir/${src.file.split('/').last()}")
// presumably: an archive that is already present is kept rather than downloaded
// again — confirm against the Download task's documentation
overwrite false
compress false
def finalPath = file("$workDir/top100k-out")
doLast {
// Make finalPath mirror the contents of the downloaded archive.
project.sync {
from tarTree(dest) // defined above. Will decompress on the fly
into finalPath
// Register the extraction directory as a task output.
outputs.dir finalPath
// Downloads the Reuters archive into tempDir, unpacks it into "<workDir>/reuters",
// then runs org.apache.lucene.benchmark.utils.ExtractReuters to produce
// "<workDir>/reuters-out".
task getReuters(type: Download) {
// note: there is no HTTPS url and we don't care because this is merely test/perf data
// NOTE(review): `src` is an empty string here although the next line derives the
// local file name from it — confirm the download URL.
src ""
// The local file name is the last '/'-separated segment of `src.file`.
dest file("$tempDir/${src.file.split('/').last()}")
// presumably: an archive that is already present is kept rather than downloaded
// again — confirm against the Download task's documentation
overwrite false
compress false
def untarPath = file("$workDir/reuters")
def finalPath = file("$workDir/reuters-out")
// The extraction step below runs a class from this project's runtime classpath,
// so that classpath must be built before this task executes.
dependsOn sourceSets.main.runtimeClasspath
doLast {
// Unpack the archive into untarPath, skipping entries matching '*.txt'.
project.sync {
from(tarTree(dest)) { // defined above. Will decompress on the fly
exclude '*.txt'
into untarPath
println "Extracting reuters to $finalPath"
finalPath.deleteDir() // necessary
// TODO consider porting ExtractReuters to groovy?
// Run ExtractReuters in a separate JVM with (untarPath, finalPath) as its arguments.
project.javaexec {
main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
classpath = sourceSets.main.runtimeClasspath
maxHeapSize = '1G'
args = [untarPath, finalPath]
// Register the extraction directory as a task output.
outputs.dir finalPath