lucene/core/src/test/org/apache/lucene/util/makeEuroparlLineFile.py - lucene-solr - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import sys
 import glob
 import datetime
 import tarfile
 import re

 # Optional switches are pulled out of sys.argv first, so the positional
 # arguments read from sys.argv later keep the same indexes wherever a switch
 # was placed on the command line.
 VERBOSE = '-verbose' in sys.argv
 if VERBOSE:
   sys.argv.remove('-verbose')

 docPerParagraph = '-docPerParagraph' in sys.argv
 if docPerParagraph:
   sys.argv.remove('-docPerParagraph')

 # Line that begins with '<CHAPTER ID=' and ends with '>'.
 reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')
 # Line that begins with '<' and ends with '>'.
 reTagOnly = re.compile('^<.*?>$')
 # Line made only of digits, optionally followed by a single period.
 reNumberOnly = re.compile(r'^\d+\.?$')

 # Number of lines written to the output so far; incremented by write().
 maxDoc = 0
 # Set to True by processTar() once it has finished an archive; members under
 # '/en/' are skipped from then on.
 didEnglish = False

 def write(date, title, pending, fOut):
   """Write the buffered text as one tab-separated line and count it.

   The line has the form title<TAB>date<TAB>body<NEWLINE>.

   date: value formatted with %s into the second column.
   title: value formatted with %s into the first column.  Tabs in it are
     replaced by spaces so that the line always has exactly three columns.
     A None title is written as the text 'None'.
   pending: list of text fragments, joined with single spaces to form the
     body.  It is emptied in place once a line has been written.
   fOut: object whose write() method receives the finished line.

   When the body is empty after stripping, nothing is written, the counter
   is not touched and pending is left as it is.
   """
   global maxDoc
   body = ' '.join(pending).replace('\t', ' ').strip()
   if not body:
     return
   # The body is tab-free at this point; the title must be as well, otherwise
   # a tab inside a heading would add a column and corrupt the line file.
   cleanTitle = ('%s' % (title,)).replace('\t', ' ')
   fOut.write('%s\t%s\t%s\n' % (cleanTitle, date, body))
   maxDoc += 1
   del pending[:]
   if VERBOSE:
     # Parenthesized single-argument print: same output with the Python 2
     # print statement, and still valid syntax for the print function.
     print(len(body))

 def _sessionDate(memberName):
   """Parse the date encoded in an archive member path.

   The third '/'-separated path component is sliced at fixed offsets:
   characters 3-4 are a two-digit year, 6-7 the month and 9-10 the day (a
   name shaped like 'ep-00-01-17.txt' fits this layout).

   Raises IndexError when the path has fewer than three components and
   ValueError when a slice is not a number or the date is invalid.
   """
   baseName = memberName.split('/')[2]
   year = int(baseName[3:5])
   # Two-digit years below 20 are read as 20xx, all others as 19xx.
   if year < 20:
     year += 2000
   else:
     year += 1900
   return datetime.date(year=year,
                        month=int(baseName[6:8]),
                        day=int(baseName[9:11]))

 def processTar(fileName, fOut):
   """Convert the regular files of one gzipped tar archive into output lines.

   fileName: path of the archive; it is opened with mode 'r:gz'.
   fOut: output file object, passed through to write().

   Members whose path contains '/en/' are processed only while the global
   didEnglish is still False.  It is set to True after this archive has been
   read completely, so later calls skip those members.

   Inside a member, a <CHAPTER ID=...> line emits the document collected so
   far (if a title is known) and makes the next line that is neither a bare
   number nor a tag the new title.  A <p> line either emits the buffered text
   as its own document (docPerParagraph) or is buffered as the marker
   'PARSEP'.  Other tag-only lines are dropped; everything else is buffered
   as body text.

   NOTE(review): matching the str patterns against member lines relies on
   extractfile() yielding str lines as it does on Python 2 -- confirm the
   target interpreter before running this elsewhere.
   """
   global didEnglish

   t = tarfile.open(fileName, 'r:gz')
   # try/finally so the archive handle is released even if a member fails
   # to parse.
   try:
     for ti in t:
       if not ti.isfile():
         continue
       if didEnglish and ti.name.find('/en/') != -1:
         continue

       date = _sessionDate(ti.name)
       if VERBOSE:
         print('')
         print('%s: %s' % (ti.name, date))

       nextIsTitle = False
       title = None
       pending = []
       member = t.extractfile(ti)
       try:
         for line in member.readlines():
           line = line.strip()
           if reChapterOnly.match(line) is not None:
             # A new chapter begins: emit what the previous one collected.
             if title is not None:
               write(date, title, pending, fOut)
             nextIsTitle = True
             continue
           if nextIsTitle:
             # Bare numbers and tags are skipped until the heading appears.
             if not reNumberOnly.match(line) and not reTagOnly.match(line):
               title = line
               nextIsTitle = False
               if VERBOSE:
                 print('  title %s' % line)
             continue
           if line.lower() == '<p>':
             if docPerParagraph:
               write(date, title, pending, fOut)
             else:
               pending.append('PARSEP')
           elif not reTagOnly.match(line):
             pending.append(line)
       finally:
         member.close()

       # Emit whatever the last chapter of this member left in the buffer.
       if title is not None and len(pending) > 0:
         write(date, title, pending, fOut)
   finally:
     t.close()

   didEnglish = True

 # '/x/lucene/data/europarl/all.lines.txt'
 # Positional arguments: the directory holding the ??-??.tgz archives and the
 # path of the line file to create.  The optional switches were already
 # removed from sys.argv earlier in this script.
 if len(sys.argv) < 3:
   sys.stderr.write(
       'usage: %s [-verbose] [-docPerParagraph] <dirIn> <fileOut>\n'
       % sys.argv[0])
   sys.exit(1)

 dirIn = sys.argv[1]
 fileOut = sys.argv[2]

 fOut = open(fileOut, 'wb')
 # try/finally so the output is flushed and closed even if an archive fails.
 try:
   # glob returns matches in arbitrary order; sorting makes it reproducible
   # which archive is processed first and therefore supplies the '/en/' files.
   for fileName in sorted(glob.glob('%s/??-??.tgz' % dirIn)):
     if fileName.endswith('.tgz'):
       print('process %s; %d docs so far...' % (fileName, maxDoc))
       processTar(fileName, fOut)
 finally:
   fOut.close()

 print('TOTAL: %s' % maxDoc)

 #run something like this:
 """

 # Europarl V5 makes 76,917 docs, avg 38.6 KB per
 python -u makeEuroparlLineFile.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
 shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
 rm /x/lucene/data/europarl/tmp.lines.txt

 # Run again, this time each paragraph is a doc:
 # Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
 python -u makeEuroparlLineFile.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
 shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
 rm /x/lucene/data/europarl/tmp.lines.txt

 # ~5.5 MB gzip'd:
 head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
 head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
 shuf tmp.txt > europarl.subset.txt
 rm -f tmp.txt
 gzip --best europarl.subset.txt
 """
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import sys
	import glob
	import datetime
	import tarfile
	import re

	# Optional switches are pulled out of sys.argv first, so the positional
	# arguments read from sys.argv later keep the same indexes wherever a switch
	# was placed on the command line.
	VERBOSE = '-verbose' in sys.argv
	if VERBOSE:
	  sys.argv.remove('-verbose')

	docPerParagraph = '-docPerParagraph' in sys.argv
	if docPerParagraph:
	  sys.argv.remove('-docPerParagraph')

	# Line that begins with '<CHAPTER ID=' and ends with '>'.
	reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')
	# Line that begins with '<' and ends with '>'.
	reTagOnly = re.compile('^<.*?>$')
	# Line made only of digits, optionally followed by a single period.
	reNumberOnly = re.compile(r'^\d+\.?$')

	# Number of lines written to the output so far; incremented by write().
	maxDoc = 0
	# Set to True by processTar() once it has finished an archive; members under
	# '/en/' are skipped from then on.
	didEnglish = False

	def write(date, title, pending, fOut):
	  """Write the buffered text as one tab-separated line and count it.

	  The line has the form title<TAB>date<TAB>body<NEWLINE>.

	  date: value formatted with %s into the second column.
	  title: value formatted with %s into the first column.  Tabs in it are
	    replaced by spaces so that the line always has exactly three columns.
	    A None title is written as the text 'None'.
	  pending: list of text fragments, joined with single spaces to form the
	    body.  It is emptied in place once a line has been written.
	  fOut: object whose write() method receives the finished line.

	  When the body is empty after stripping, nothing is written, the counter
	  is not touched and pending is left as it is.
	  """
	  global maxDoc
	  body = ' '.join(pending).replace('\t', ' ').strip()
	  if not body:
	    return
	  # The body is tab-free at this point; the title must be as well, otherwise
	  # a tab inside a heading would add a column and corrupt the line file.
	  cleanTitle = ('%s' % (title,)).replace('\t', ' ')
	  fOut.write('%s\t%s\t%s\n' % (cleanTitle, date, body))
	  maxDoc += 1
	  del pending[:]
	  if VERBOSE:
	    # Parenthesized single-argument print: same output with the Python 2
	    # print statement, and still valid syntax for the print function.
	    print(len(body))

	def _sessionDate(memberName):
	  """Parse the date encoded in an archive member path.

	  The third '/'-separated path component is sliced at fixed offsets:
	  characters 3-4 are a two-digit year, 6-7 the month and 9-10 the day (a
	  name shaped like 'ep-00-01-17.txt' fits this layout).

	  Raises IndexError when the path has fewer than three components and
	  ValueError when a slice is not a number or the date is invalid.
	  """
	  baseName = memberName.split('/')[2]
	  year = int(baseName[3:5])
	  # Two-digit years below 20 are read as 20xx, all others as 19xx.
	  if year < 20:
	    year += 2000
	  else:
	    year += 1900
	  return datetime.date(year=year,
	                       month=int(baseName[6:8]),
	                       day=int(baseName[9:11]))

	def processTar(fileName, fOut):
	  """Convert the regular files of one gzipped tar archive into output lines.

	  fileName: path of the archive; it is opened with mode 'r:gz'.
	  fOut: output file object, passed through to write().

	  Members whose path contains '/en/' are processed only while the global
	  didEnglish is still False.  It is set to True after this archive has been
	  read completely, so later calls skip those members.

	  Inside a member, a <CHAPTER ID=...> line emits the document collected so
	  far (if a title is known) and makes the next line that is neither a bare
	  number nor a tag the new title.  A <p> line either emits the buffered text
	  as its own document (docPerParagraph) or is buffered as the marker
	  'PARSEP'.  Other tag-only lines are dropped; everything else is buffered
	  as body text.

	  NOTE(review): matching the str patterns against member lines relies on
	  extractfile() yielding str lines as it does on Python 2 -- confirm the
	  target interpreter before running this elsewhere.
	  """
	  global didEnglish

	  t = tarfile.open(fileName, 'r:gz')
	  # try/finally so the archive handle is released even if a member fails
	  # to parse.
	  try:
	    for ti in t:
	      if not ti.isfile():
	        continue
	      if didEnglish and ti.name.find('/en/') != -1:
	        continue

	      date = _sessionDate(ti.name)
	      if VERBOSE:
	        print('')
	        print('%s: %s' % (ti.name, date))

	      nextIsTitle = False
	      title = None
	      pending = []
	      member = t.extractfile(ti)
	      try:
	        for line in member.readlines():
	          line = line.strip()
	          if reChapterOnly.match(line) is not None:
	            # A new chapter begins: emit what the previous one collected.
	            if title is not None:
	              write(date, title, pending, fOut)
	            nextIsTitle = True
	            continue
	          if nextIsTitle:
	            # Bare numbers and tags are skipped until the heading appears.
	            if not reNumberOnly.match(line) and not reTagOnly.match(line):
	              title = line
	              nextIsTitle = False
	              if VERBOSE:
	                print(' title %s' % line)
	            continue
	          if line.lower() == '<p>':
	            if docPerParagraph:
	              write(date, title, pending, fOut)
	            else:
	              pending.append('PARSEP')
	          elif not reTagOnly.match(line):
	            pending.append(line)
	      finally:
	        member.close()

	      # Emit whatever the last chapter of this member left in the buffer.
	      if title is not None and len(pending) > 0:
	        write(date, title, pending, fOut)
	  finally:
	    t.close()

	  didEnglish = True

	# '/x/lucene/data/europarl/all.lines.txt'
	# Positional arguments: the directory holding the ??-??.tgz archives and the
	# path of the line file to create.  The optional switches were already
	# removed from sys.argv earlier in this script.
	if len(sys.argv) < 3:
	  sys.stderr.write(
	      'usage: %s [-verbose] [-docPerParagraph] <dirIn> <fileOut>\n'
	      % sys.argv[0])
	  sys.exit(1)

	dirIn = sys.argv[1]
	fileOut = sys.argv[2]

	fOut = open(fileOut, 'wb')
	# try/finally so the output is flushed and closed even if an archive fails.
	try:
	  # glob returns matches in arbitrary order; sorting makes it reproducible
	  # which archive is processed first and therefore supplies the '/en/' files.
	  for fileName in sorted(glob.glob('%s/??-??.tgz' % dirIn)):
	    if fileName.endswith('.tgz'):
	      print('process %s; %d docs so far...' % (fileName, maxDoc))
	      processTar(fileName, fOut)
	finally:
	  fOut.close()

	print('TOTAL: %s' % maxDoc)

	#run something like this:
	"""

	# Europarl V5 makes 76,917 docs, avg 38.6 KB per
	python -u makeEuroparlLineFile.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
	shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
	rm /x/lucene/data/europarl/tmp.lines.txt

	# Run again, this time each paragraph is a doc:
	# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
	python -u makeEuroparlLineFile.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
	shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
	rm /x/lucene/data/europarl/tmp.lines.txt

	# ~5.5 MB gzip'd:
	head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
	head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
	shuf tmp.txt > europarl.subset.txt
	rm -f tmp.txt
	gzip --best europarl.subset.txt
	"""