blob: 73f1dd7283357096542ab4cf4a1d421d9b6e72bf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
plugins {
  id 'java'
  // The data-fetching tasks in this script are declared with type "Download".
  id 'de.undercouch.download'
}

project.description = 'System for benchmarking Lucene'
// Module dependencies of the benchmark. External coordinates carry no version
// here; presumably versions are managed elsewhere in the build -- confirm.
dependencies {
  // Sibling Lucene modules needed at compile time.
  implementation(project(':lucene:core'))
  implementation(project(':lucene:analysis:common'))
  implementation(project(':lucene:facet'))
  implementation(project(':lucene:highlighter'))
  implementation(project(':lucene:queries'))
  implementation(project(':lucene:spatial-extras'))
  implementation(project(':lucene:queryparser'))

  // Third-party libraries.
  implementation("org.apache.commons:commons-compress")
  implementation("com.ibm.icu:icu4j")
  implementation("org.locationtech.spatial4j:spatial4j")
  implementation("net.sourceforge.nekohtml:nekohtml") {
    // Keep the transitive xml-apis module off the classpath.
    exclude module: "xml-apis"
  }

  // Only required when running, not when compiling.
  runtimeOnly(project(':lucene:analysis:icu'))

  testImplementation(project(':lucene:test-framework'))
}
// Directories (resolved against this project's directory) shared by the tasks below:
// "temp" receives downloaded archives, "work" receives unpacked/prepared data.
File tempDir = project.file("temp")
File workDir = project.file("work")
// Runs org.apache.lucene.benchmark.byTask.Benchmark on the main runtime classpath.
// Optional project properties (pass with -P on the command line):
//   taskAlg        - algorithm file passed as the single program argument
//                    (default: conf/micro-standard.alg)
//   maxHeapSize    - maximum heap size for the benchmark JVM (default: 1G)
//   standardOutput - path of a file that receives the benchmark's standard output
task run(type: JavaExec) {
  description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
  main 'org.apache.lucene.benchmark.byTask.Benchmark'
  classpath sourceSets.main.runtimeClasspath

  // allow these to be specified on the CLI via -PtaskAlg= for example
  args = [propertyOrDefault('taskAlg', 'conf/micro-standard.alg')]
  maxHeapSize = propertyOrDefault('maxHeapSize', '1G')

  String stdOutStr = propertyOrDefault('standardOutput', null)
  if (stdOutStr != null) {
    // Open the redirect target only when the task actually executes. Opening it
    // while the build is being configured would create/truncate the file even if
    // "run" is never executed, and would leave the stream open.
    OutputStream redirectedOut = null
    doFirst {
      File outFile = new File(stdOutStr)
      // Make sure the target directory exists before opening the stream.
      outFile.absoluteFile.parentFile?.mkdirs()
      redirectedOut = outFile.newOutputStream()
      standardOutput = redirectedOut
    }
    doLast {
      // Flush and release the file once the benchmark has finished.
      // NOTE: not reached if the benchmark itself fails.
      redirectedOut?.close()
    }
  }

  // Debugging is off by default; the port and suspend values below apply
  // once "enabled" is switched on.
  debugOptions {
    enabled = false
    port = 5005
    suspend = true
  }
}
/* Old "collation" Ant target:
gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
*/
/* Old "shingle" Ant target:
gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
*/
// The remaining tasks just get / extract / prepare data
// Downloads enwiki-20070527-pages-articles.xml.bz2 into tempDir and
// decompresses it there; the decompressed XML file is the declared task output.
task getEnWiki(type: Download) {
  def finalName = "enwiki-20070527-pages-articles.xml"
  def bz2Name = finalName + ".bz2"
  def bz2File = file("$tempDir/" + bz2Name)

  src "https://home.apache.org/~dsmiley/data/" + bz2Name
  dest bz2File
  overwrite false
  compress false

  outputs.file file("$tempDir/$finalName")

  doLast {
    // Decompress the downloaded archive into tempDir.
    ant.bunzip2(src: bz2File, dest: tempDir)
  }
}
// Downloads the randomized GeoNames "allCountries" snapshot (bzip2) into tempDir
// and decompresses it there; the decompressed text file is the declared task output.
task getGeoNames(type: Download) {
  // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
  // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
  // and then compress with: bzip2 -9 -k file_random.txt
  def finalName = "geonames_20130921_randomOrder_allCountries.txt"
  def bz2Name = finalName + ".bz2"
  def bz2File = file("$tempDir/" + bz2Name)

  src "https://home.apache.org/~dsmiley/data/" + bz2Name
  dest bz2File
  overwrite false
  compress false

  outputs.file file("$tempDir/$finalName")

  doLast {
    // bunzip2 names the result by chopping the .bz2 suffix off the archive name
    ant.bunzip2(src: bz2File, dest: tempDir)
  }
}
// Downloads the top-100k Wikipedia words tarball into tempDir and syncs its
// contents into work/top100k-out, which is the declared task output.
task getTop100kWikiWordFiles(type: Download) {
  src "https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"

  // The local archive keeps the last path segment of the source URL as its name.
  def tarball = file("$tempDir/${src.file.split('/').last()}")
  def finalPath = file("$workDir/top100k-out")

  dest tarball
  overwrite false
  compress false

  outputs.dir finalPath

  doLast {
    project.sync {
      // tarTree will decompress on the fly
      from tarTree(tarball)
      into finalPath
    }
  }
}
// Downloads the Reuters-21578 tarball into tempDir, unpacks it (minus top-level
// *.txt entries) into work/reuters, then runs ExtractReuters to produce
// work/reuters-out, which is the declared task output.
task getReuters(type: Download) {
  // note: there is no HTTPS url and we don't care because this is merely test/perf data
  src "http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"

  // The local archive keeps the last path segment of the source URL as its name.
  def tarball = file("$tempDir/${src.file.split('/').last()}")
  def unpackedDir = file("$workDir/reuters")
  def finalPath = file("$workDir/reuters-out")

  dest tarball
  overwrite false
  compress false

  // ExtractReuters is launched from the main runtime classpath below.
  dependsOn sourceSets.main.runtimeClasspath
  outputs.dir finalPath

  doLast {
    project.sync {
      into unpackedDir
      // tarTree will decompress on the fly
      from(tarTree(tarball)) {
        exclude '*.txt'
      }
    }

    println "Extracting reuters to $finalPath"
    // necessary -- presumably ExtractReuters expects a fresh output directory; confirm
    finalPath.deleteDir()

    // TODO consider porting ExtractReuters to groovy?
    project.javaexec {
      classpath = sourceSets.main.runtimeClasspath
      main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
      args = [unpackedDir, finalPath]
      maxHeapSize = '1G'
    }
  }
}