| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.icu; |
| |
| |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UProperty; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSetIterator; |
| |
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.FileFilter; |
| import java.io.FileInputStream; |
| import java.io.FileOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.OutputStream; |
| import java.io.OutputStreamWriter; |
| import java.io.Writer; |
| import java.net.URL; |
| import java.net.URLConnection; |
| import java.nio.charset.StandardCharsets; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| /** |
| * Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt |
| * |
| * ASSUMPTION: This class will be run with current directory set to |
| * lucene/analysis/icu/src/data/utr30/ |
| * |
| * <ol> |
| * <li> |
| * Downloads nfc.txt, nfkc.txt and nfkc_cf.txt from icu-project.org, |
| * overwriting the versions in lucene/analysis/icu/src/data/utr30/. |
| * </li> |
| * <li> |
| * Converts round-trip mappings in nfc.txt (containing '=') |
| * that map to at least one [:Diacritic:] character |
| * into one-way mappings ('>' instead of '='). |
| * </li> |
| * </ol> |
| */ |
| public class GenerateUTR30DataFiles { |
| private static final String ICU_SVN_TAG_URL |
| = "http://source.icu-project.org/repos/icu/tags"; |
| private static final String ICU_RELEASE_TAG = "release-62-1"; |
| private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2"; |
| private static final String NFC_TXT = "nfc.txt"; |
| private static final String NFKC_TXT = "nfkc.txt"; |
| private static final String NFKC_CF_TXT = "nfkc_cf.txt"; |
| private static byte[] DOWNLOAD_BUFFER = new byte[8192]; |
| private static final Pattern ROUND_TRIP_MAPPING_LINE_PATTERN |
| = Pattern.compile("^\\s*([^=]+?)\\s*=\\s*(.*)$"); |
| private static final Pattern VERBATIM_RULE_LINE_PATTERN |
| = Pattern.compile("^#\\s*Rule:\\s*verbatim\\s*$", Pattern.CASE_INSENSITIVE); |
| private static final Pattern RULE_LINE_PATTERN |
| = Pattern.compile("^#\\s*Rule:\\s*(.*)>(.*)", Pattern.CASE_INSENSITIVE); |
| private static final Pattern BLANK_OR_COMMENT_LINE_PATTERN |
| = Pattern.compile("^\\s*(?:#.*)?$"); |
| private static final Pattern NUMERIC_VALUE_PATTERN |
| = Pattern.compile("Numeric[-\\s_]*Value", Pattern.CASE_INSENSITIVE); |
| |
| public static void main(String args[]) { |
| try { |
| getNFKCDataFilesFromIcuProject(); |
| expandRulesInUTR30DataFiles(); |
| } catch (Throwable t) { |
| t.printStackTrace(System.err); |
| System.exit(1); |
| } |
| } |
| |
| private static void expandRulesInUTR30DataFiles() throws IOException { |
| FileFilter filter = new FileFilter() { |
| @Override |
| public boolean accept(File pathname) { |
| String name = pathname.getName(); |
| return pathname.isFile() && name.matches(".*\\.(?s:txt)") |
| && ! name.equals(NFC_TXT) && ! name.equals(NFKC_TXT) |
| && ! name.equals(NFKC_CF_TXT); |
| } |
| }; |
| for (File file : new File(".").listFiles(filter)) { |
| expandDataFileRules(file); |
| } |
| } |
| |
| private static void expandDataFileRules(File file) throws IOException { |
| final FileInputStream stream = new FileInputStream(file); |
| final InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8); |
| final BufferedReader bufferedReader = new BufferedReader(reader); |
| StringBuilder builder = new StringBuilder(); |
| String line; |
| boolean verbatim = false; |
| boolean modified = false; |
| int lineNum = 0; |
| try { |
| while (null != (line = bufferedReader.readLine())) { |
| ++lineNum; |
| if (VERBATIM_RULE_LINE_PATTERN.matcher(line).matches()) { |
| verbatim = true; |
| builder.append(line).append("\n"); |
| } else { |
| Matcher ruleMatcher = RULE_LINE_PATTERN.matcher(line); |
| if (ruleMatcher.matches()) { |
| verbatim = false; |
| builder.append(line).append("\n"); |
| try { |
| String leftHandSide = ruleMatcher.group(1).trim(); |
| String rightHandSide = ruleMatcher.group(2).trim(); |
| expandSingleRule(builder, leftHandSide, rightHandSide); |
| } catch (IllegalArgumentException e) { |
| System.err.println |
| ("ERROR in " + file.getName() + " line #" + lineNum + ":"); |
| e.printStackTrace(System.err); |
| System.exit(1); |
| } |
| modified = true; |
| } else { |
| if (BLANK_OR_COMMENT_LINE_PATTERN.matcher(line).matches()) { |
| builder.append(line).append("\n"); |
| } else { |
| if (verbatim) { |
| builder.append(line).append("\n"); |
| } else { |
| modified = true; |
| } |
| } |
| } |
| } |
| } |
| } finally { |
| bufferedReader.close(); |
| } |
| if (modified) { |
| System.err.println("Expanding rules in and overwriting " + file.getName()); |
| final FileOutputStream out = new FileOutputStream(file, false); |
| Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8); |
| try { |
| writer.write(builder.toString()); |
| } finally { |
| writer.close(); |
| } |
| } |
| } |
| |
| private static void getNFKCDataFilesFromIcuProject() throws IOException { |
| URL icuTagsURL = new URL(ICU_SVN_TAG_URL + "/"); |
| URL icuReleaseTagURL = new URL(icuTagsURL, ICU_RELEASE_TAG + "/"); |
| URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/"); |
| |
| System.err.print("Downloading " + NFKC_TXT + " ... "); |
| download(new URL(norm2url, NFKC_TXT), NFKC_TXT); |
| System.err.println("done."); |
| System.err.print("Downloading " + NFKC_CF_TXT + " ... "); |
| download(new URL(norm2url, NFKC_CF_TXT), NFKC_CF_TXT); |
| System.err.println("done."); |
| |
| System.err.print("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... "); |
| URLConnection connection = openConnection(new URL(norm2url, NFC_TXT)); |
| BufferedReader reader = new BufferedReader |
| (new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8)); |
| Writer writer = new OutputStreamWriter(new FileOutputStream(NFC_TXT), StandardCharsets.UTF_8); |
| try { |
| String line; |
| |
| while (null != (line = reader.readLine())) { |
| Matcher matcher = ROUND_TRIP_MAPPING_LINE_PATTERN.matcher(line); |
| if (matcher.matches()) { |
| final String leftHandSide = matcher.group(1); |
| final String rightHandSide = matcher.group(2).trim(); |
| List<String> diacritics = new ArrayList<>(); |
| for (String outputCodePoint : rightHandSide.split("\\s+")) { |
| int ch = Integer.parseInt(outputCodePoint, 16); |
| if (UCharacter.hasBinaryProperty(ch, UProperty.DIACRITIC) |
| // gennorm2 fails if U+0653-U+0656 are included in round-trip mappings |
| || (ch >= 0x653 && ch <= 0x656)) { |
| diacritics.add(outputCodePoint); |
| } |
| } |
| if ( ! diacritics.isEmpty()) { |
| StringBuilder replacementLine = new StringBuilder(); |
| replacementLine.append(leftHandSide).append(">").append(rightHandSide); |
| replacementLine.append(" # one-way: diacritic"); |
| if (diacritics.size() > 1) { |
| replacementLine.append("s"); |
| } |
| for (String diacritic : diacritics) { |
| replacementLine.append(" ").append(diacritic); |
| } |
| line = replacementLine.toString(); |
| } |
| } |
| writer.write(line); |
| writer.write("\n"); |
| } |
| } finally { |
| reader.close(); |
| writer.close(); |
| } |
| System.err.println("done."); |
| } |
| |
| private static void download(URL url, String outputFile) |
| throws IOException { |
| final URLConnection connection = openConnection(url); |
| final InputStream inputStream = connection.getInputStream(); |
| final OutputStream outputStream = new FileOutputStream(outputFile); |
| int numBytes; |
| try { |
| while (-1 != (numBytes = inputStream.read(DOWNLOAD_BUFFER))) { |
| outputStream.write(DOWNLOAD_BUFFER, 0, numBytes); |
| } |
| } finally { |
| inputStream.close(); |
| outputStream.close(); |
| } |
| } |
| |
| private static URLConnection openConnection(URL url) throws IOException { |
| final URLConnection connection = url.openConnection(); |
| connection.setUseCaches(false); |
| connection.addRequestProperty("Cache-Control", "no-cache"); |
| connection.connect(); |
| return connection; |
| } |
| |
| private static void expandSingleRule |
| (StringBuilder builder, String leftHandSide, String rightHandSide) |
| throws IllegalArgumentException { |
| UnicodeSet set = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE); |
| boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches(); |
| for (UnicodeSetIterator it = new UnicodeSetIterator(set) ; it.nextRange() ; ) { |
| if (it.codepoint != UnicodeSetIterator.IS_STRING) { |
| if (numericValue) { |
| for (int cp = it.codepoint ; cp <= it.codepointEnd ; ++cp) { |
| builder.append(String.format(Locale.ROOT, "%04X", cp)).append('>'); |
| builder.append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp))); |
| builder.append(" # ").append(UCharacter.getName(cp)); |
| builder.append("\n"); |
| } |
| } else { |
| builder.append(String.format(Locale.ROOT, "%04X", it.codepoint)); |
| if (it.codepointEnd > it.codepoint) { |
| builder.append("..").append(String.format(Locale.ROOT, "%04X", it.codepointEnd)); |
| } |
| builder.append('>').append(rightHandSide).append("\n"); |
| } |
| } else { |
| System.err.println("ERROR: String '" + it.getString() + "' found in UnicodeSet"); |
| System.exit(1); |
| } |
| } |
| } |
| } |