| #!/usr/bin/env python3 |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| import gzip |
| import os |
| import random |
| import re |
| import shutil |
| import subprocess |
| import tempfile |
| import time |
| import urllib.request |
| from io import TextIOWrapper |
| from pathlib import Path |
| |
# When True, skip the download/extract/cleanup steps and reuse a fixed tmp
# dir (see sample_europarl) for faster iteration while hacking on this script.
DEBUG = False

# Target size, in characters, of each small document produced by split_docs;
# actual sizes are drawn from a normal distribution centered here.
TARGET_DOC_CHARS = 1024
| |
| |
def compress_with_seek_points(file_name_in: str, file_name_out: str, num_seek_points: int):
    """Gzip-compress ``file_name_in`` into ``file_name_out`` as up to
    ``num_seek_points`` concatenated gzip members (one per roughly
    equal-sized chunk of the *uncompressed* input), so a reader can seek
    straight to a member boundary and start decompressing there.

    The byte offset of every member after the first is written, one per
    line, to a sidecar file named like ``file_name_out`` with its last
    three characters (assumed to be ".gz") replaced by ".seek".
    """
    # Aim for equal-sized chunks of the uncompressed input.
    bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

    seek_points: list[int] = []

    # We append gzip members below, so any stale output must be removed first.
    if os.path.exists(file_name_out):
        os.remove(file_name_out)

    with open(file_name_in, "rb") as f_in:
        f_out = None

        bytes_in_chunk = 0

        chunk_count = 0

        try:
            while True:
                if f_out is None:
                    # Record where this new gzip member begins in the output file
                    # (the first member implicitly starts at offset 0 and is not recorded).
                    if os.path.exists(file_name_out):
                        seek_points.append(os.path.getsize(file_name_out))
                        print(" create chunk %s at pos=%s" % (chunk_count, seek_points[-1]))
                    else:
                        print(" create chunk %s at pos=0" % chunk_count)
                    f_out = gzip.open(file_name_out, "ab")
                    chunk_count += 1

                line = f_in.readline()
                if len(line) == 0:
                    break

                bytes_in_chunk += len(line)
                f_out.write(line)

                # Close this member once the chunk is big enough, but never open
                # more members than the requested number of seek points.
                if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
                    f_out.close()
                    f_out = None
                    bytes_in_chunk = 0
        finally:
            # Bug fix: the final gzip member was never closed, leaving its
            # compressed trailer unflushed until interpreter shutdown, so a
            # reader could see a truncated last member. Close it explicitly.
            if f_out is not None:
                f_out.close()

    with open(file_name_out[:-3] + ".seek", "w") as f_seek:
        f_seek.writelines("%d\n" % seek_point for seek_point in seek_points)
| |
| |
# Strips SGML-ish markup tags such as <CHAPTER ...> / <SPEAKER ...> from the
# Europarl text (non-greedy so adjacent tags are matched separately).
re_tag = re.compile(r"<[^>]+?>")
# Collapses runs of consecutive newlines into a single replacement.
re_newlines = re.compile(r"\n+")
# Matches a single whitespace character; appears unused in this file -- TODO
# confirm before removing.
re_space = re.compile(r"\s")

# used to find word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r"\W", re.UNICODE)

# Source tarball for the Europarl v7 parallel corpus.
EUROPARL_V7_URL = "https://www.statmt.org/europarl/v7/europarl.tgz"
| |
| |
def split_docs(all_out: TextIOWrapper, title_string: str, date_string: str, body_string: str):
    """Chop one large document body into ~1 KB pieces.

    Each piece is written to ``all_out`` as a single line of the form
    ``title<TAB>date<TAB>fragment``, repeating the same title and date for
    every piece. Returns the number of pieces written.
    """
    doc_count = 0
    remaining = body_string
    while remaining:
        # Draw this piece's target length from a normal distribution centered
        # on TARGET_DOC_CHARS; tiny outliers are re-drawn (trimmed normal).
        target_len = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS / 4))
        if target_len < 64:
            continue

        # Extend the cut to the next non-word character so words aren't split;
        # if there is none, take everything that's left.
        match = re_next_non_word_character.search(remaining, target_len)
        cut = match.start(0) if match is not None else len(remaining)

        fragment = remaining[:cut].strip()
        all_out.write("%s\t%s\t%s\n" % (title_string, date_string, fragment))
        remaining = remaining[cut:]
        doc_count += 1

    return doc_count
| |
| |
def sample_europarl():
    """Build shuffled text samples from the Europarl v7 corpus.

    Downloads europarl.tgz into the cwd (if absent), extracts it to a tmp
    dir, flattens every chapter into one "title<TAB>date<TAB>body" line per
    ~1 KB document in all.txt, shuffles that file with the external `shuf`
    tool, then writes 20/200/2000 MB random samples, each also compressed
    into a seekable multi-member .gz via compress_with_seek_points.
    """
    # download europarl.tgz v7, if not already here (in cwd):
    file_name = "europarl.tgz"
    if not os.path.exists(file_name):
        print("Download %s to %s..." % (EUROPARL_V7_URL, file_name))
        # Download to a .tmp name first so a partial download never leaves a
        # file that would be mistaken for a complete tarball on the next run.
        urllib.request.urlretrieve(EUROPARL_V7_URL, file_name + ".tmp")
        _ = Path(f"{file_name}.tmp").rename(file_name)
    else:
        print("%s already here; skipping download..." % file_name)

    # In DEBUG mode, reuse a fixed tmp dir (assumed to already hold the
    # extracted corpus) and skip extraction/cleanup below.
    if not DEBUG:
        tmp_dir_path = tempfile.mkdtemp()
    else:
        tmp_dir_path = "/tmp/tmp31ekzg75"
    print('Using tmp dir "%s"...' % tmp_dir_path)
    try:
        if not DEBUG:
            cmd = "tar xzf %s -C %s" % (file_name, tmp_dir_path)
            print("Run: %s" % cmd)
            # NOTE(review): check=False means a failed extraction is silently
            # ignored here; the os.walk below would then just find no files.
            subprocess.run(cmd, shell=True, check=False)

        doc_count = 0   # small docs written via split_docs
        skip_count = 0  # chapters/files skipped (empty body or no title)
        file_count = 0  # .txt files visited

        all_txt_file_name = "%s/all.txt" % tmp_dir_path

        print("Extract text...")

        start_time = time.time()
        next_print_time = start_time + 3
        # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
        with open(all_txt_file_name, "w", encoding="utf-8") as all_out:
            for dir_path, _, file_names in os.walk("%s/txt" % tmp_dir_path):
                for file_name in file_names:
                    if file_name.endswith(".txt"):
                        file_count += 1

                        # File names are presumably "ep-YY-MM-DD[...].txt";
                        # [3:-4] strips the prefix and extension -- TODO confirm
                        # against the corpus layout. Two-digit years pivot at 50.
                        year, month, day = (int(x) for x in file_name[3:-4].split("-")[:3])
                        if year >= 50:
                            year = 1900 + year
                        else:
                            year = 2000 + year

                        date_string = "%04d-%02d-%02d" % (year, month, day)

                        # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
                        chapter_count = 0
                        with open("%s/%s" % (dir_path, file_name), encoding="utf-8", errors="ignore") as f_in:
                            last_text: list[str] = []  # body lines of the chapter being accumulated
                            last_title = None          # title of that chapter; None before the first <CHAPTER>
                            while True:
                                line = f_in.readline()
                                if line == "":
                                    break
                                line = line.strip()
                                if line.startswith("<CHAPTER "):
                                    # A new chapter starts: flush the previous
                                    # one (strip tags, collapse newlines, split
                                    # into small docs) before resetting state.
                                    if last_title is not None:
                                        s = " ".join(last_text)
                                        s = re_tag.sub(" ", s)
                                        s = re_newlines.sub(" ", s)
                                        s = s.strip()
                                        if len(s) > 0:
                                            doc_count += split_docs(all_out, last_title, date_string, s)
                                        else:
                                            skip_count += 1

                                    last_text = []
                                    chapter_count += 1
                                    # The chapter title is the next non-empty
                                    # line after the <CHAPTER> tag (tags removed).
                                    while True:
                                        last_title = f_in.readline()
                                        if last_title == "":
                                            last_title = None
                                            break
                                        last_title = re_tag.sub(" ", last_title).strip()
                                        if len(last_title) > 0:
                                            break
                                    continue
                                last_text.append(line)

                            # Flush the final chapter of the file (no trailing
                            # <CHAPTER> tag triggers it inside the loop).
                            if last_title is not None:
                                s = " ".join(last_text)
                                s = re_tag.sub(" ", s)
                                s = re_newlines.sub(" ", s)
                                s = s.strip()
                                if len(s) > 0:
                                    doc_count += split_docs(all_out, last_title, date_string, s)
                                else:
                                    skip_count += 1
                                chapter_count += 1
                            else:
                                skip_count += 1

                        if chapter_count > 0:
                            # print('%s/%s: %d chapters' % (dir_path, file_name, chapter_count))
                            pass

                        # Progress line roughly every 3 seconds.
                        # NOTE(review): divides by file_count, so this would
                        # crash only if a print fired with zero files seen --
                        # unreachable here since file_count >= 1 at this point.
                        now = time.time()
                        if now > next_print_time:
                            print(
                                "%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB..."
                                % (now - start_time, (file_count - skip_count) / 1000, file_count / 1000, 100 * (file_count - skip_count) / file_count, doc_count / 1000000, all_out.tell() / 1024 / 1024 / 1024)
                            )
                            while next_print_time < now:
                                next_print_time += 3

        total_mb = os.path.getsize(all_txt_file_name) / 1024 / 1024
        now = time.time()
        print(
            "%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB..."
            % (
                now - start_time,
                (file_count - skip_count) / 1000,
                file_count / 1000,
                100 * (file_count - skip_count) / file_count,
                doc_count / 1000000,
                os.path.getsize(all_txt_file_name) / 1024 / 1024 / 1024,
            )
        )

        print("Shuffle...")
        # Relies on the external `shuf` (GNU coreutils) being on PATH;
        # check=False means a failure is silently ignored.
        subprocess.run("shuf %s > %s.shuffled" % (all_txt_file_name, all_txt_file_name), shell=True, check=False)

        for mb in (20, 200, 2000):
            print("Sample %d MB file..." % mb)
            file_name_out = "%dmb.txt" % mb
            with open(file_name_out, "w", encoding="utf-8") as f_out:
                # Keep each line with probability mb/total_mb, so the sample
                # is ~mb MB in expectation (not exactly mb MB).
                chance = mb / total_mb

                with open(all_txt_file_name + ".shuffled", encoding="utf-8") as f:
                    while True:
                        line = f.readline()
                        if len(line) == 0:
                            break
                        if random.random() <= chance:
                            f_out.write(line)

            print(" got %.2f MB" % (os.path.getsize(file_name_out) / 1024 / 1024))

            compress_with_seek_points(file_name_out, file_name_out + ".gz", mb)

    finally:
        print('Removing tmp dir "%s"...' % tmp_dir_path)
        if not DEBUG:
            shutil.rmtree(tmp_dir_path)

    print("\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n")
| |
| |
# Entry point: flip the constant to True to exercise compress_with_seek_points
# on a local file instead of running the full Europarl pipeline.
# NOTE(review): this runs at import time -- consider an
# `if __name__ == "__main__":` guard if this module is ever imported.
if False:
    compress_with_seek_points("/x/tmp/europarl.lines.txt", "/x/tmp/foo.txt.gz", 16)
else:
    sample_europarl()