# dev-tools/scripts/create_line_file_docs.py - lucene-solr - Git at Google

 import os
 import gzip
 import time
 import random
 import re
 import urllib.request
 import subprocess
 import tempfile
 import shutil

 # When True, sample_europarl reuses a fixed, hard-coded tmp dir, skips the tar
 # extraction step, and does not delete the tmp dir when it finishes:
 DEBUG = False

 # Mean size, in characters, of the smaller docs that split_docs produces:
 TARGET_DOC_CHARS = 1024

 def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):

   '''
   Compresses file_name_in into file_name_out as a series of gzip chunks, each
   appended as its own gzip member, and records where the chunks start.

   A chunk is only ever ended on a line boundary, once more than
   size(file_name_in) / num_seek_points uncompressed bytes have been written to
   it; at most num_seek_points chunks are created.  The byte offset within
   file_name_out of every chunk except the first (which is at 0) is written, one
   per line, to file_name_out[:-3] + '.seek', so file_name_out is expected to
   end with '.gz'.

   Any pre-existing file_name_out is removed first.
   '''

   bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

   seek_points = []

   # chunks are appended to the output one at a time, so start from a clean slate:
   if os.path.exists(file_name_out):
     os.remove(file_name_out)

   with open(file_name_in, 'rb') as f_in:

     f_out = None

     bytes_in_chunk = 0

     chunk_count = 0

     try:
       while True:
         if f_out is None:
           # the current size of the output is where the next chunk will start
           if os.path.exists(file_name_out):
             seek_points.append(os.path.getsize(file_name_out))
             print('  create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
           else:
             print('  create chunk %s at pos=0' % chunk_count)
           f_out = gzip.open(file_name_out, 'ab')
           chunk_count += 1

         # NOTE: the chunk is opened before the next line is read, so if the input
         # ends right after a chunk was closed, a final chunk holding no lines (and
         # a seek point for it) is still created.
         line = f_in.readline()
         if len(line) == 0:
           break

         bytes_in_chunk += len(line)
         f_out.write(line)

         if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
           f_out.close()
           f_out = None
           bytes_in_chunk = 0
     finally:
       # always finish the last chunk explicitly, instead of relying on garbage
       # collection to flush it and write the gzip trailer:
       if f_out is not None:
         f_out.close()

   with open(file_name_out[:-3] + '.seek', 'w') as f_seek:
     for seek_point in seek_points:
       f_seek.write('%d\n' % seek_point)

 # Patterns are raw strings so that regex escapes such as \s and \W are not
 # treated as (invalid) Python string escapes.

 # matches one markup tag, i.e. '<' through the nearest '>':
 re_tag = re.compile(r'<[^>]+?>')
 # matches a run of one or more newlines:
 re_newlines = re.compile(r'\n+')
 # matches a single whitespace character:
 re_space = re.compile(r'\s')

 # used to find word break, for splitting docs into ~1 KB sized smaller docs:
 re_next_non_word_character = re.compile(r'\W', re.U)

 # where sample_europarl downloads the Europarl v7 tarball from:
 EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'

 def split_docs(all_out, title_string, date_string, body_string):

   '''
   Splits docs into smallish (~1 KB) sized docs, repeating same title and date

   The target length of each fragment is drawn from a normal distribution with
   mean TARGET_DOC_CHARS and standard deviation TARGET_DOC_CHARS/4 (draws below
   64 are rejected and redrawn), and the fragment is then extended up to the
   next non-word character so that it does not end in the middle of a word.
   Each fragment is stripped and written to all_out as one
   title<TAB>date<TAB>body line.

   Returns the number of docs written.
   '''

   doc_count = 0
   body_length = len(body_string)

   # walk an offset through body_string rather than re-slicing the remainder
   # after every fragment, which copies the whole tail each time (quadratic in
   # the body length):
   start = 0
   while start < body_length:
     char_count = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS/4))
     if char_count < 64:
       # trimmed normal: reject too-small draws and draw again
       continue

     m = re_next_non_word_character.search(body_string, start + char_count)
     if m is not None:
       end = m.start(0)
     else:
       # no word break left: the rest of the body is the last fragment
       end = body_length

     body_string_fragment = body_string[start:end].strip()
     start = end

     # a whitespace-only fragment would become a doc with an empty body; skip it:
     if len(body_string_fragment) == 0:
       continue

     #print('write title %d, body %d' % (len(title_string), len(body_string_fragment)))
     all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
     doc_count += 1

   return doc_count

 def _write_chapter_docs(all_out, title, date_string, text_lines):

   '''
   Joins the lines of one chapter, replaces tags and newline runs with spaces,
   and writes the result via split_docs.  Returns the number of docs written,
   which is 0 when no text is left after stripping.
   '''

   text = ' '.join(text_lines)
   text = re_tag.sub(' ', text)
   text = re_newlines.sub(' ', text)
   text = text.strip()
   if len(text) == 0:
     return 0
   return split_docs(all_out, title, date_string, text)

 def _extract_docs_from_file(path, date_string, all_out):

   '''
   Parses one Europarl text file and writes its chapters to all_out.

   A chapter starts at a line beginning with '<CHAPTER '; its title is the first
   following line that still has text once tags are removed, and its body is all
   lines collected up to the next chapter start.  Returns a
   (doc_count, skip_count) tuple, where skip_count counts chapters without any
   text, plus one if the file ends without a titled chapter.
   '''

   doc_count = 0
   skip_count = 0
   title = None
   text_lines = []

   # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
   with open(path, 'r', encoding='utf-8', errors='ignore') as f_in:
     lines = iter(f_in)
     for line in lines:
       line = line.strip()
       if not line.startswith('<CHAPTER '):
         text_lines.append(line)
         continue

       # a new chapter starts: write out the previous one, if there was one
       if title is not None:
         written = _write_chapter_docs(all_out, title, date_string, text_lines)
         if written > 0:
           doc_count += written
         else:
           skip_count += 1
         text_lines = []

       # consume lines until one has text left after removing tags; title stays
       # None if the file ends first:
       title = None
       for title_line in lines:
         title_line = re_tag.sub(' ', title_line).strip()
         if len(title_line) > 0:
           title = title_line
           break

   if title is not None:
     written = _write_chapter_docs(all_out, title, date_string, text_lines)
     if written > 0:
       doc_count += written
     else:
       skip_count += 1
   else:
     skip_count += 1

   return doc_count, skip_count

 def sample_europarl():

   '''
   Builds sampled line-docs files from the Europarl v7 corpus.

   Downloads europarl.tgz into the current directory (unless it is already
   there), extracts it into a tmp dir with tar, and writes every chapter of
   every .txt file under the extracted txt/ directory, split into smaller docs,
   to a single all.txt file.  That file is shuffled with the external shuf
   command and then randomly sampled into 20mb.txt, 200mb.txt and 2000mb.txt in
   the current directory; each of those is also compressed via
   compress_with_seek_points.

   The tmp dir is removed at the end (unless DEBUG); europarl.tgz is left behind.

   Raises subprocess.CalledProcessError if tar or shuf fails, and RuntimeError
   if no docs could be extracted.
   '''

   # download europarl.tgz v7, if not already here (in cwd):
   tgz_file_name = 'europarl.tgz'
   if not os.path.exists(tgz_file_name):
     print('Download %s to %s...' % (EUROPARL_V7_URL, tgz_file_name))
     # download under a temporary name and rename when done, so a partial
     # download is never mistaken for the complete file:
     urllib.request.urlretrieve(EUROPARL_V7_URL, tgz_file_name + '.tmp')
     os.rename(tgz_file_name + '.tmp', tgz_file_name)
   else:
     print('%s already here; skipping download...' % tgz_file_name)

   if not DEBUG:
     tmp_dir_path = tempfile.mkdtemp()
   else:
     tmp_dir_path = '/tmp/tmp31ekzg75'
   print('Using tmp dir "%s"...' % tmp_dir_path)
   try:
     if not DEBUG:
       # argument list (no shell), and fail loudly if extraction fails:
       cmd = ['tar', 'xzf', tgz_file_name, '-C', tmp_dir_path]
       print('Run: %s' % ' '.join(cmd))
       subprocess.run(cmd, check=True)

     doc_count = 0
     # NOTE(review): skip_count mixes chapters without text and files without a
     # titled chapter, yet is reported against file_count below, so the "keep"
     # figures are approximate.
     skip_count = 0
     file_count = 0

     all_txt_file_name = '%s/all.txt' % tmp_dir_path

     print('Extract text...')

     start_time = time.time()
     next_print_time = start_time + 3
     # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
     with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
       for dir_path, _dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
         for txt_file_name in file_names:
           if not txt_file_name.endswith('.txt'):
             continue
           file_count += 1

           # the date is encoded in the file name, e.g. ep-09-10-22-009.txt; two
           # digit years of 50 and above are taken as 19xx, the rest as 20xx:
           year, month, day = (int(x) for x in txt_file_name[3:-4].split('-')[:3])
           if year >= 50:
             year = 1900 + year
           else:
             year = 2000 + year

           date_string = '%04d-%02d-%02d' % (year, month, day)

           file_doc_count, file_skip_count = _extract_docs_from_file(
             '%s/%s' % (dir_path, txt_file_name), date_string, all_out)
           doc_count += file_doc_count
           skip_count += file_skip_count

           # print progress at most once every 3 seconds:
           now = time.time()
           if now > next_print_time:
             print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
                   (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                    100 * (file_count - skip_count) / file_count,
                    doc_count / 1000000, all_out.tell() / 1024/1024/1024))
             while next_print_time < now:
               next_print_time += 3

     # without this check the statistics and sampling below would divide by zero:
     if doc_count == 0:
       raise RuntimeError('no docs extracted from %s/txt (%d .txt files found)' % (tmp_dir_path, file_count))

     total_mb = os.path.getsize(all_txt_file_name)/1024/1024
     now = time.time()
     print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
           (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
            100 * (file_count - skip_count) / file_count,
            doc_count / 1000000, os.path.getsize(all_txt_file_name) / 1024/1024/1024))

     print('Shuffle...')
     shuffled_file_name = all_txt_file_name + '.shuffled'
     with open(shuffled_file_name, 'wb') as shuffled_out:
       subprocess.run(['shuf', all_txt_file_name], stdout=shuffled_out, check=True)

     for mb in (20, 200, 2000):
       print('Sample %d MB file...' % mb)
       file_name_out = '%dmb.txt' % mb

       # keep each line with this probability, so that roughly mb MB are kept:
       chance = mb / total_mb

       with open(file_name_out, 'w', encoding='utf-8') as f_out:
         with open(shuffled_file_name, 'r', encoding='utf-8') as f:
           for line in f:
             if random.random() <= chance:
               f_out.write(line)

       print('  got %.2f MB' % (os.path.getsize(file_name_out)/1024/1024))

       compress_with_seek_points(file_name_out,
                                 file_name_out + '.gz',
                                 mb)

   finally:
     print('Removing tmp dir "%s"...' % tmp_dir_path)
     if not DEBUG:
       shutil.rmtree(tmp_dir_path)

   print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')

 # Script entry point.  The first branch is a manual, normally disabled switch
 # for running compress_with_seek_points alone against hard-coded local paths;
 # by default the full sample_europarl pipeline runs.
 # NOTE: there is no __main__ guard, so this also runs when the file is imported.
 if False:
   compress_with_seek_points('/x/tmp/europarl.lines.txt',
                             '/x/tmp/foo.txt.gz',
                             16)
 else:
   sample_europarl()
	import os
	import gzip
	import time
	import random
	import re
	import urllib.request
	import subprocess
	import tempfile
	import shutil

	# When True, sample_europarl reuses a fixed, hard-coded tmp dir, skips the tar
	# extraction step, and does not delete the tmp dir when it finishes:
	DEBUG = False

	# Mean size, in characters, of the smaller docs that split_docs produces:
	TARGET_DOC_CHARS = 1024

	def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):

	  '''
	  Compresses file_name_in into file_name_out as a series of gzip chunks, each
	  appended as its own gzip member, and records where the chunks start.

	  A chunk is only ever ended on a line boundary, once more than
	  size(file_name_in) / num_seek_points uncompressed bytes have been written to
	  it; at most num_seek_points chunks are created.  The byte offset within
	  file_name_out of every chunk except the first (which is at 0) is written, one
	  per line, to file_name_out[:-3] + '.seek', so file_name_out is expected to
	  end with '.gz'.

	  Any pre-existing file_name_out is removed first.
	  '''

	  bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

	  seek_points = []

	  # chunks are appended to the output one at a time, so start from a clean slate:
	  if os.path.exists(file_name_out):
	    os.remove(file_name_out)

	  with open(file_name_in, 'rb') as f_in:

	    f_out = None

	    bytes_in_chunk = 0

	    chunk_count = 0

	    try:
	      while True:
	        if f_out is None:
	          # the current size of the output is where the next chunk will start
	          if os.path.exists(file_name_out):
	            seek_points.append(os.path.getsize(file_name_out))
	            print(' create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
	          else:
	            print(' create chunk %s at pos=0' % chunk_count)
	          f_out = gzip.open(file_name_out, 'ab')
	          chunk_count += 1

	        # NOTE: the chunk is opened before the next line is read, so if the input
	        # ends right after a chunk was closed, a final chunk holding no lines (and
	        # a seek point for it) is still created.
	        line = f_in.readline()
	        if len(line) == 0:
	          break

	        bytes_in_chunk += len(line)
	        f_out.write(line)

	        if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
	          f_out.close()
	          f_out = None
	          bytes_in_chunk = 0
	    finally:
	      # always finish the last chunk explicitly, instead of relying on garbage
	      # collection to flush it and write the gzip trailer:
	      if f_out is not None:
	        f_out.close()

	  with open(file_name_out[:-3] + '.seek', 'w') as f_seek:
	    for seek_point in seek_points:
	      f_seek.write('%d\n' % seek_point)

	# Patterns are raw strings so that regex escapes such as \s and \W are not
	# treated as (invalid) Python string escapes.

	# matches one markup tag, i.e. '<' through the nearest '>':
	re_tag = re.compile(r'<[^>]+?>')
	# matches a run of one or more newlines:
	re_newlines = re.compile(r'\n+')
	# matches a single whitespace character:
	re_space = re.compile(r'\s')

	# used to find word break, for splitting docs into ~1 KB sized smaller docs:
	re_next_non_word_character = re.compile(r'\W', re.U)

	# where sample_europarl downloads the Europarl v7 tarball from:
	EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'

	def split_docs(all_out, title_string, date_string, body_string):

	  '''
	  Splits docs into smallish (~1 KB) sized docs, repeating same title and date

	  The target length of each fragment is drawn from a normal distribution with
	  mean TARGET_DOC_CHARS and standard deviation TARGET_DOC_CHARS/4 (draws below
	  64 are rejected and redrawn), and the fragment is then extended up to the
	  next non-word character so that it does not end in the middle of a word.
	  Each fragment is stripped and written to all_out as one
	  title<TAB>date<TAB>body line.

	  Returns the number of docs written.
	  '''

	  doc_count = 0
	  body_length = len(body_string)

	  # walk an offset through body_string rather than re-slicing the remainder
	  # after every fragment, which copies the whole tail each time (quadratic in
	  # the body length):
	  start = 0
	  while start < body_length:
	    char_count = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS/4))
	    if char_count < 64:
	      # trimmed normal: reject too-small draws and draw again
	      continue

	    m = re_next_non_word_character.search(body_string, start + char_count)
	    if m is not None:
	      end = m.start(0)
	    else:
	      # no word break left: the rest of the body is the last fragment
	      end = body_length

	    body_string_fragment = body_string[start:end].strip()
	    start = end

	    # a whitespace-only fragment would become a doc with an empty body; skip it:
	    if len(body_string_fragment) == 0:
	      continue

	    #print('write title %d, body %d' % (len(title_string), len(body_string_fragment)))
	    all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
	    doc_count += 1

	  return doc_count

	def _write_chapter_docs(all_out, title, date_string, text_lines):

	  '''
	  Joins the lines of one chapter, replaces tags and newline runs with spaces,
	  and writes the result via split_docs.  Returns the number of docs written,
	  which is 0 when no text is left after stripping.
	  '''

	  text = ' '.join(text_lines)
	  text = re_tag.sub(' ', text)
	  text = re_newlines.sub(' ', text)
	  text = text.strip()
	  if len(text) == 0:
	    return 0
	  return split_docs(all_out, title, date_string, text)

	def _extract_docs_from_file(path, date_string, all_out):

	  '''
	  Parses one Europarl text file and writes its chapters to all_out.

	  A chapter starts at a line beginning with '<CHAPTER '; its title is the first
	  following line that still has text once tags are removed, and its body is all
	  lines collected up to the next chapter start.  Returns a
	  (doc_count, skip_count) tuple, where skip_count counts chapters without any
	  text, plus one if the file ends without a titled chapter.
	  '''

	  doc_count = 0
	  skip_count = 0
	  title = None
	  text_lines = []

	  # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
	  with open(path, 'r', encoding='utf-8', errors='ignore') as f_in:
	    lines = iter(f_in)
	    for line in lines:
	      line = line.strip()
	      if not line.startswith('<CHAPTER '):
	        text_lines.append(line)
	        continue

	      # a new chapter starts: write out the previous one, if there was one
	      if title is not None:
	        written = _write_chapter_docs(all_out, title, date_string, text_lines)
	        if written > 0:
	          doc_count += written
	        else:
	          skip_count += 1
	        text_lines = []

	      # consume lines until one has text left after removing tags; title stays
	      # None if the file ends first:
	      title = None
	      for title_line in lines:
	        title_line = re_tag.sub(' ', title_line).strip()
	        if len(title_line) > 0:
	          title = title_line
	          break

	  if title is not None:
	    written = _write_chapter_docs(all_out, title, date_string, text_lines)
	    if written > 0:
	      doc_count += written
	    else:
	      skip_count += 1
	  else:
	    skip_count += 1

	  return doc_count, skip_count

	def sample_europarl():

	  '''
	  Builds sampled line-docs files from the Europarl v7 corpus.

	  Downloads europarl.tgz into the current directory (unless it is already
	  there), extracts it into a tmp dir with tar, and writes every chapter of
	  every .txt file under the extracted txt/ directory, split into smaller docs,
	  to a single all.txt file.  That file is shuffled with the external shuf
	  command and then randomly sampled into 20mb.txt, 200mb.txt and 2000mb.txt in
	  the current directory; each of those is also compressed via
	  compress_with_seek_points.

	  The tmp dir is removed at the end (unless DEBUG); europarl.tgz is left behind.

	  Raises subprocess.CalledProcessError if tar or shuf fails, and RuntimeError
	  if no docs could be extracted.
	  '''

	  # download europarl.tgz v7, if not already here (in cwd):
	  tgz_file_name = 'europarl.tgz'
	  if not os.path.exists(tgz_file_name):
	    print('Download %s to %s...' % (EUROPARL_V7_URL, tgz_file_name))
	    # download under a temporary name and rename when done, so a partial
	    # download is never mistaken for the complete file:
	    urllib.request.urlretrieve(EUROPARL_V7_URL, tgz_file_name + '.tmp')
	    os.rename(tgz_file_name + '.tmp', tgz_file_name)
	  else:
	    print('%s already here; skipping download...' % tgz_file_name)

	  if not DEBUG:
	    tmp_dir_path = tempfile.mkdtemp()
	  else:
	    tmp_dir_path = '/tmp/tmp31ekzg75'
	  print('Using tmp dir "%s"...' % tmp_dir_path)
	  try:
	    if not DEBUG:
	      # argument list (no shell), and fail loudly if extraction fails:
	      cmd = ['tar', 'xzf', tgz_file_name, '-C', tmp_dir_path]
	      print('Run: %s' % ' '.join(cmd))
	      subprocess.run(cmd, check=True)

	    doc_count = 0
	    # NOTE(review): skip_count mixes chapters without text and files without a
	    # titled chapter, yet is reported against file_count below, so the "keep"
	    # figures are approximate.
	    skip_count = 0
	    file_count = 0

	    all_txt_file_name = '%s/all.txt' % tmp_dir_path

	    print('Extract text...')

	    start_time = time.time()
	    next_print_time = start_time + 3
	    # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
	    with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
	      for dir_path, _dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
	        for txt_file_name in file_names:
	          if not txt_file_name.endswith('.txt'):
	            continue
	          file_count += 1

	          # the date is encoded in the file name, e.g. ep-09-10-22-009.txt; two
	          # digit years of 50 and above are taken as 19xx, the rest as 20xx:
	          year, month, day = (int(x) for x in txt_file_name[3:-4].split('-')[:3])
	          if year >= 50:
	            year = 1900 + year
	          else:
	            year = 2000 + year

	          date_string = '%04d-%02d-%02d' % (year, month, day)

	          file_doc_count, file_skip_count = _extract_docs_from_file(
	            '%s/%s' % (dir_path, txt_file_name), date_string, all_out)
	          doc_count += file_doc_count
	          skip_count += file_skip_count

	          # print progress at most once every 3 seconds:
	          now = time.time()
	          if now > next_print_time:
	            print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
	                  (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
	                   100 * (file_count - skip_count) / file_count,
	                   doc_count / 1000000, all_out.tell() / 1024/1024/1024))
	            while next_print_time < now:
	              next_print_time += 3

	    # without this check the statistics and sampling below would divide by zero:
	    if doc_count == 0:
	      raise RuntimeError('no docs extracted from %s/txt (%d .txt files found)' % (tmp_dir_path, file_count))

	    total_mb = os.path.getsize(all_txt_file_name)/1024/1024
	    now = time.time()
	    print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
	          (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
	           100 * (file_count - skip_count) / file_count,
	           doc_count / 1000000, os.path.getsize(all_txt_file_name) / 1024/1024/1024))

	    print('Shuffle...')
	    shuffled_file_name = all_txt_file_name + '.shuffled'
	    with open(shuffled_file_name, 'wb') as shuffled_out:
	      subprocess.run(['shuf', all_txt_file_name], stdout=shuffled_out, check=True)

	    for mb in (20, 200, 2000):
	      print('Sample %d MB file...' % mb)
	      file_name_out = '%dmb.txt' % mb

	      # keep each line with this probability, so that roughly mb MB are kept:
	      chance = mb / total_mb

	      with open(file_name_out, 'w', encoding='utf-8') as f_out:
	        with open(shuffled_file_name, 'r', encoding='utf-8') as f:
	          for line in f:
	            if random.random() <= chance:
	              f_out.write(line)

	      print(' got %.2f MB' % (os.path.getsize(file_name_out)/1024/1024))

	      compress_with_seek_points(file_name_out,
	                                file_name_out + '.gz',
	                                mb)

	  finally:
	    print('Removing tmp dir "%s"...' % tmp_dir_path)
	    if not DEBUG:
	      shutil.rmtree(tmp_dir_path)

	  print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')

	# Script entry point.  The first branch is a manual, normally disabled switch
	# for running compress_with_seek_points alone against hard-coded local paths;
	# by default the full sample_europarl pipeline runs.
	# NOTE: there is no __main__ guard, so this also runs when the file is imported.
	if False:
	  compress_with_seek_points('/x/tmp/europarl.lines.txt',
	                            '/x/tmp/foo.txt.gz',
	                            16)
	else:
	  sample_europarl()