blob: 81c3ac77426e847df8993a5e25a18ab3a659d38f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.sis.measure;
import java.util.Map;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.text.Format;
import java.text.FieldPosition;
import java.text.ParsePosition;
import java.text.ParseException;
import java.util.ResourceBundle;
import java.util.MissingResourceException;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.security.AccessController;
import javax.measure.Dimension;
import javax.measure.Unit;
import javax.measure.format.ParserException;
import org.apache.sis.internal.system.Loggers;
import org.apache.sis.internal.util.Constants;
import org.apache.sis.internal.util.DefinitionURI;
import org.apache.sis.internal.util.FinalFieldSetter;
import org.apache.sis.internal.util.XPaths;
import org.apache.sis.math.Fraction;
import org.apache.sis.util.ArgumentChecks;
import org.apache.sis.math.MathFunctions;
import org.apache.sis.util.CharSequences;
import org.apache.sis.util.Characters;
import org.apache.sis.util.Localized;
import org.apache.sis.util.resources.Errors;
import org.apache.sis.util.CorruptedObjectException;
import org.apache.sis.util.collection.WeakValueHashMap;
import org.apache.sis.util.logging.Logging;
/**
* Parses and formats units of measurement as SI symbols, URI in OGC namespace or other symbols.
* This class combines in a single class the API from {@link java.text} and the API from {@link javax.measure.format}.
* In addition to the symbols of the <cite>Système international</cite> (SI), this class is also capable of handling
* some symbols found in <cite>Well Known Text</cite> (WKT) definitions or in XML files.
*
* <h2>Parsing authority codes</h2>
* As a special case, if a character sequence given to the {@link #parse(CharSequence)} method is of the
* {@code "EPSG:####"} or {@code "urn:ogc:def:uom:EPSG::####"} form (ignoring case and whitespaces),
* then {@code "####"} is parsed as an integer and forwarded to the {@link Units#valueOfEPSG(int)} method.
*
* <h2>NetCDF unit symbols</h2>
* The attributes in netCDF files often merge the axis direction with the angular unit,
* as in {@code "degrees_east"}, {@code "degrees_north"} or {@code "Degrees North"}.
* This class ignores those suffixes and unconditionally returns {@link Units#DEGREE} for all axis directions.
* In particular, the units for {@code "degrees_west"} and {@code "degrees_east"} do <strong>not</strong> have
* opposite sign. It is the caller's responsibility to handle the direction of axes associated with netCDF units.
*
* <h2>Multi-threading</h2>
* {@code UnitFormat} is generally not thread-safe. If units need to be parsed or formatted in different threads,
* each thread should have its own {@code UnitFormat} instance.
*
* @author Martin Desruisseaux (Geomatys)
* @version 1.0
*
* @see Units#valueOf(String)
*
* @since 0.8
* @module
*/
public class UnitFormat extends Format implements javax.measure.format.UnitFormat, Localized {
/**
* Serial version identifier, for cross-version compatibility of serialized instances.
*/
private static final long serialVersionUID = -3064428584419360693L;
/**
* The unit name for degrees (not necessarily angular), to be handled in a special way.
* Must contain only ASCII lower case letters ([a … z]) because {@link #fromName(String)}
* compares characters after forcing the ASCII lower-case bit, which is meaningful only for ASCII letters.
*/
private static final String DEGREES = "degrees";
/**
* The unit name for dimensionless unit. Used as the resource bundle key
* in replacement of the empty symbol when formatting with {@link Style#NAME}.
*/
private static final String UNITY = "unity";
/**
* The default instance used by {@link Units#valueOf(String)} for parsing units of measurement.
* While {@code UnitFormat} is generally not thread-safe, this particular instance is safe if
* we never invoke any setter method and we do not format with {@link Style#NAME}.
* This instance is created by the private constructor, which assigns empty maps from
* {@link Collections#emptyMap()} for the labels.
*/
static final UnitFormat INSTANCE = new UnitFormat();
/**
* The locale specified at construction time or modified by {@link #setLocale(Locale)}.
*
* @see #getLocale()
*/
private Locale locale;
/**
* Whether this {@code UnitFormat} should format long names like "metre" or use unit symbols.
* Initialized to {@link Style#SYMBOL} by the constructors and modified by {@link #setStyle(Style)}.
*
* @see #getStyle()
*/
private Style style;
/**
 * Identify whether unit formatting uses ASCII symbols, Unicode symbols or full localized names.
 * For example the {@link Units#CUBIC_METRE} units can be formatted in the following ways:
 *
 * <ul>
 *   <li>As a symbol using Unicode characters: <b>m³</b></li>
 *   <li>As a symbol restricted to the ASCII characters set: <b>m3</b></li>
 *   <li>As a long name:<ul>
 *     <li>in English: <cite>cubic metre</cite></li>
 *     <li>in French: <cite>mètre cube</cite></li>
 *   </ul></li>
 * </ul>
 *
 * Each style carries the characters to use for multiplication and division, and knows how to
 * append a symbol and a power (integer or rational) to an {@link Appendable}.
 *
 * @author  Martin Desruisseaux (Geomatys)
 * @version 0.8
 * @since   0.8
 * @module
 */
public enum Style {
    /**
     * Format unit symbols using Unicode characters. Units formatted in this style use superscript digits
     * for exponents (as in “m³”), the dot operator (“⋅”) for multiplications, specialized characters when
     * they exist (e.g. U+212A for Kelvin sign), <i>etc.</i>
     *
     * <p>This is the default style of {@link UnitFormat}.</p>
     *
     * @see Unit#getSymbol()
     */
    SYMBOL(AbstractUnit.MULTIPLY, AbstractUnit.DIVIDE),

    /**
     * Format unit symbols using a syntax close to the Unified Code for Units of Measure (UCUM) one.
     * The character set is restricted to ASCII. The multiplication operator is the period (“.”).
     *
     * <h4>Modification to UCUM syntax rules</h4>
     * UCUM does not allow floating point numbers in unit terms, so the use of period as an operator
     * should not be ambiguous. However Apache SIS relaxes this restriction in order to support the
     * scale factors commonly found in angular units (e.g. π/180). The meaning of a period in a string
     * is resolved with two SIS-specific rules:
     *
     * <ul>
     *   <li>Unit symbols shall not begin or end with a decimal digit or a superscript.</li>
     *   <li>A period between two decimal digits is interpreted as a decimal separator.</li>
     * </ul>
     *
     * @see org.apache.sis.util.CharSequences#toASCII(CharSequence)
     */
    UCUM('.', '/') {
        /**
         * Replace non-ASCII characters on a "best effort" basis.
         * Symbols starting with the degree sign are handled first: "°C" becomes "Cel",
         * "°K" (with either the ASCII letter or the Kelvin sign U+212A) becomes "K",
         * and any other "°…" symbol gets the degree sign replaced by "deg".
         */
        @Override Appendable appendSymbol(final Appendable toAppendTo, final String value) throws IOException {
            if (value.startsWith("°")) {
                final int length = value.length();
                if (length == 2) {
                    switch (value.charAt(1)) {
                        case 'C': return toAppendTo.append("Cel");
                        /*
                         * The Kelvin sign is written as a Unicode escape. If the literal character is
                         * normalized to the ASCII letter by an editor or a text processing tool, the
                         * two labels become identical and the switch no longer compiles.
                         */
                        case '\u212A':          // U+212A (Kelvin sign)
                        case 'K': return toAppendTo.append('K');
                    }
                }
                return toAppendTo.append("deg").append(value, 1, length);
            }
            final CharSequence cs = CharSequences.toASCII(value);
            final int length = cs.length();
            for (int i=0; i<length; i++) {
                toAppendTo.append(Characters.toNormalScript(cs.charAt(i)));
            }
            return toAppendTo;
        }

        /** Formats the power for a unit symbol, as plain decimal digits. */
        @Override void appendPower(final Appendable toAppendTo, final int power) throws IOException {
            toAppendTo.append(String.valueOf(power));
        }

        /** Actually illegal for UCUM, but at least ensure that it contains only ASCII characters. */
        @Override void appendPower(final Appendable toAppendTo, final Fraction power) throws IOException {
            toAppendTo.append(EXPONENT).append(OPEN).append(String.valueOf(power.numerator))
                    .append('/').append(String.valueOf(power.denominator)).append(CLOSE);
        }
    },

    /**
     * Format unit symbols as localized long names if known, or Unicode symbols otherwise.
     *
     * @see Unit#getName()
     */
    NAME(AbstractUnit.MULTIPLY, AbstractUnit.DIVIDE);

    /**
     * Other symbols not in the {@link Style} enumeration because common to all.
     */
    static final char EXPONENT_OR_MULTIPLY = '*', EXPONENT = '^', OPEN = '(', CLOSE = ')';

    /**
     * Symbols to use for unit multiplications or divisions.
     */
    final char multiply, divide;

    /**
     * Creates a new style using the given symbols.
     *
     * @param  multiply  the character to insert between two multiplied terms.
     * @param  divide    the character to insert between numerators and denominators.
     */
    private Style(final char multiply, final char divide) {
        this.multiply = multiply;
        this.divide   = divide;
    }

    /**
     * Appends a string that may contains Unicode characters. The enumeration is responsible
     * for converting the Unicode characters into ASCII ones if needed.
     * The default implementation appends the value unchanged.
     *
     * @param  toAppendTo  where to append the symbol.
     * @param  value       the symbol to append.
     * @return the value returned by the {@code append} call.
     * @throws IOException if an error occurred while writing to the destination.
     */
    Appendable appendSymbol(final Appendable toAppendTo, final String value) throws IOException {
        return toAppendTo.append(value);
    }

    /**
     * Appends an integer power. The power may be added as an exponent if allowed by the format style.
     * The default implementation uses a superscript character for powers from 0 to 9 inclusive,
     * and the plain decimal representation for all other values.
     *
     * @param  toAppendTo  where to append the power.
     * @param  power       the power to append.
     * @throws IOException if an error occurred while writing to the destination.
     */
    void appendPower(final Appendable toAppendTo, final int power) throws IOException {
        if (power >= 0 && power <= 9) {
            toAppendTo.append(Characters.toSuperScript((char) (power + '0')));
        } else {
            toAppendTo.append(String.valueOf(power));
        }
    }

    /**
     * Appends a rational power. The power is written after the {@code '^'} character,
     * between parenthesis unless its string representation is a single character.
     *
     * @param  toAppendTo  where to append the power.
     * @param  power       the power to append.
     * @throws IOException if an error occurred while writing to the destination.
     */
    void appendPower(final Appendable toAppendTo, final Fraction power) throws IOException {
        toAppendTo.append(EXPONENT);
        final String value = power.toString();
        if (value.length() == 1) {
            toAppendTo.append(value);
        } else {
            toAppendTo.append(OPEN).append(value).append(CLOSE);
        }
    }
}
/**
* Symbols or names to use for formatting units in replacement to the default unit symbols or names.
* The {@link Unit} instances are the ones specified by user in calls to {@link #label(Unit, String)}.
* This map is consulted before any other rule by {@link #format(Unit, Appendable)}.
*
* @see #label(Unit, String)
*/
private final Map<Unit<?>,String> unitToLabel;
/**
* Units associated to a given label (in addition to the system-wide {@link UnitRegistry}).
* This map is the converse of {@link #unitToLabel}. The {@link Unit} instances may differ from the ones
* specified by user since {@link AbstractUnit#symbol} may have been set to the label specified by the user.
* The labels may contain some characters normally not allowed in unit symbols, like white spaces.
*
* @see #label(Unit, String)
*/
private final Map<String,Unit<?>> labelToUnit;
/**
* The mapping from unit symbols to long localized names.
* Those resources are locale-dependent and loaded when first needed.
* This field is reset to {@code null} by {@link #setLocale(Locale)}.
*
* @see #symbolToName()
*/
private transient volatile ResourceBundle symbolToName;
/**
* Mapping from long localized and unlocalized names to unit instances.
* This map is used only for parsing and created when first needed.
* This field is reset to {@code null} by {@link #setLocale(Locale)}.
*
* @see #fromName(String)
*/
private transient volatile Map<String,Unit<?>> nameToUnit;
/**
* Cached values of {@link #nameToUnit}, for avoiding to load the same information many times and for saving memory
* if the user creates many {@code UnitFormat} instances. Note that we do not cache {@link #symbolToName} because
* {@link ResourceBundle} already provides its own caching mechanism.
* Keys are locales and values are the unmodifiable maps built by {@link #fromName(String)}.
*
* @see #fromName(String)
*/
private static final WeakValueHashMap<Locale, Map<String,Unit<?>>> SHARED = new WeakValueHashMap<>(Locale.class);
/**
 * Creates the unique {@link #INSTANCE}. That instance uses the root locale, the symbol style
 * and empty label maps obtained from {@link Collections#emptyMap()}.
 */
private UnitFormat() {
    this.unitToLabel = Collections.emptyMap();
    this.labelToUnit = Collections.emptyMap();
    this.style       = Style.SYMBOL;
    this.locale      = Locale.ROOT;
}
/**
 * Creates a new format for the given locale. The new instance formats unit symbols
 * ({@link Style#SYMBOL}) and has no user-defined label.
 *
 * @param  locale  the locale to use for parsing and formatting units.
 */
public UnitFormat(final Locale locale) {
    ArgumentChecks.ensureNonNull("locale", locale);
    this.unitToLabel = new HashMap<>();
    this.labelToUnit = new HashMap<>();
    this.style       = Style.SYMBOL;
    this.locale      = locale;
}
/**
 * Returns the locale used by this {@code UnitFormat}. This is the locale given at
 * construction time, or the one given in the last call to {@link #setLocale(Locale)}.
 *
 * @return the locale of this {@code UnitFormat}.
 */
@Override
public Locale getLocale() {
    return this.locale;
}
/**
 * Sets the locale that this {@code UnitFormat} will use for long names.
 * For example a call to <code>setLocale({@linkplain Locale#US})</code>
 * instructs this formatter to use the “meter” spelling instead of “metre”.
 * The locale-dependent resources loaded for the previous locale are discarded.
 *
 * @param  locale  the new locale for this {@code UnitFormat}.
 *
 * @see UnitServices#getUnitFormat(String)
 */
public void setLocale(final Locale locale) {
    ArgumentChecks.ensureNonNull("locale", locale);
    this.locale = locale;
    /*
     * Both caches depend on the locale. Clearing them forces
     * a reload for the new locale the next time they are needed.
     */
    this.symbolToName = null;
    this.nameToUnit   = null;
}
/**
 * Returns whether this {@code UnitFormat} depends on the {@link Locale} given at construction time
 * for performing its tasks. This method returns {@code true} if formatting long names (e.g. “metre”
 * or “meter”) and {@code false} if formatting only the unit symbol (e.g. “m”).
 *
 * @return {@code true} if formatting depends on the locale.
 */
@Override
public boolean isLocaleSensitive() {
    return Style.NAME == style;
}
/**
 * Returns whether unit formatting uses ASCII symbols, Unicode symbols or full localized names.
 *
 * @return the style of units formatted by this {@code UnitFormat} instance.
 */
public Style getStyle() {
    return this.style;
}
/**
 * Sets whether unit formatting should use ASCII symbols, Unicode symbols or full localized names.
 *
 * @param  style  the desired style of units.
 */
public void setStyle(final Style style) {
    ArgumentChecks.ensureNonNull("style", style);
    this.style = style;
}
/**
 * Attaches a label to the specified unit. A <cite>label</cite> can be a substitute to either the
 * {@linkplain AbstractUnit#getSymbol() unit symbol} or the {@link AbstractUnit#getName() unit name},
 * depending on the {@linkplain #getStyle() format style}.
 * If the specified label is already associated to another unit, then the previous association is discarded.
 * Leading and trailing white spaces in the label are ignored.
 *
 * <h4>Restriction on character set</h4>
 * Current implementation accepts only {@linkplain Character#isLetter(int) letters},
 * {@linkplain Characters#isSubScript(int) subscripts}, {@linkplain Character#isSpaceChar(int) spaces}
 * (including non-breaking spaces but not CR/LF characters),
 * the degree sign (°) and a few other characters like underscore.
 * The set of legal characters may be expanded in future Apache SIS versions,
 * but the following restrictions are likely to remain:
 *
 * <ul>
 *   <li>The following characters are reserved since they have special meaning in UCUM format, in URI
 *       or in Apache SIS parser: <blockquote>" # ( ) * + - . / : = ? [ ] { } ^ ⋅ ∕</blockquote></li>
 *   <li>The symbol can not begin or end with digits, since such digits would be confused with unit power.</li>
 * </ul>
 *
 * @param  unit   the unit being labeled.
 * @param  label  the new label for the given unit.
 * @throws IllegalArgumentException if the given label is not a valid unit name.
 */
@Override
public void label(final Unit<?> unit, String label) {
    ArgumentChecks.ensureNonNull ("unit", unit);
    label = CharSequences.trimWhitespaces(label);
    ArgumentChecks.ensureNonEmpty("label", label);
    /*
     * Verify each code point of the label. Spaces are tested with Character.isSpaceChar(int),
     * NOT Character.isWhitespace(int).
     */
    int offset = 0;
    while (offset < label.length()) {
        final int codePoint = label.codePointAt(offset);
        final boolean isLegal = AbstractUnit.isSymbolChar(codePoint) || Character.isSpaceChar(codePoint);
        if (!isLegal) {
            throw new IllegalArgumentException(Errors.format(Errors.Keys.IllegalArgumentValue_2, "label", label));
        }
        offset += Character.charCount(codePoint);
    }
    /*
     * The unit stored in the "label to unit" map may be a variant of the given unit
     * created by ConventionalUnit.forSymbol(label).
     */
    final Unit<?> target;
    if (unit instanceof ConventionalUnit<?>) {
        target = ((ConventionalUnit<?>) unit).forSymbol(label);
    } else {
        target = unit;
    }
    /*
     * Update the two maps: forget the label previously used for the given unit,
     * then register the new association in both directions.
     */
    final String  previousLabel       = unitToLabel.put(unit, label);
    final Unit<?> unitOfPreviousLabel = labelToUnit.remove(previousLabel);
    final Unit<?> previousUnitOfLabel = labelToUnit.put(label, target);
    if (previousUnitOfLabel != null && !previousUnitOfLabel.equals(target)) {
        /*
         * The label was used for another unit; discard that association.
         * Assuming there is no bug in our algorithm, the exception below should never happen
         * unless this UnitFormat has been modified concurrently in another thread.
         */
        final String discarded = unitToLabel.remove(previousUnitOfLabel);
        if (!label.equals(discarded)) {
            throw new CorruptedObjectException("unitToLabel");
        }
    }
    if (unitOfPreviousLabel != null) {
        /*
         * Assuming there is no bug in our algorithm, the exception below should never happen
         * unless this UnitFormat has been modified concurrently in another thread.
         * We compare system units because the units may not be strictly equal
         * as a result of the call to ConventionalUnit.forSymbol(label).
         */
        final boolean isSameSystemUnit = unitOfPreviousLabel.getSystemUnit().equals(unit.getSystemUnit());
        if (!isSameSystemUnit) {
            throw new CorruptedObjectException("labelToUnit");
        }
    }
}
/**
 * Loads the {@code UnitNames} resource bundle for the given locale,
 * using the class loader of {@code UnitFormat}.
 *
 * @param  locale  the locale of the resources to load.
 * @return the resource bundle for the given locale.
 */
static ResourceBundle getBundle(final Locale locale) {
    final ClassLoader loader = UnitFormat.class.getClassLoader();
    return ResourceBundle.getBundle("org.apache.sis.measure.UnitNames", locale, loader);
}
/**
 * Returns the mapping from unit symbols to long localized names.
 * This mapping is loaded when first needed and memorized as long as the locale does not change.
 *
 * @return the resource bundle for the current locale.
 */
private ResourceBundle symbolToName() {
    final ResourceBundle cached = symbolToName;
    if (cached != null) {
        return cached;
    }
    final ResourceBundle loaded = getBundle(locale);
    symbolToName = loaded;
    return loaded;
}
/**
 * Returns the unit instance for the given long (un)localized name.
 * This method is somewhat the converse of {@link #symbolToName()}, but recognizes also
 * international and American spelling of unit names in addition of localized names.
 * The intent is to recognize "meter" as well as "metre".
 *
 * <p>The search is done in three steps: a special case for "deg", "degree" and "degrees"
 * (optionally followed by a one-letter suffix), then a lookup in a map of names loaded when
 * first needed, and finally a second lookup after removal of trailing digits which are
 * interpreted as a power (for example "meter2").</p>
 *
 * <p>While we said that {@code UnitFormat} is not thread safe, we make an exception for this method
 * for allowing the singleton {@link #INSTANCE} to parse symbols in a multi-threads environment.</p>
 *
 * @param  uom  the unit name, without leading or trailing spaces.
 * @return the unit for the given name, or {@code null} if unknown.
 */
private Unit<?> fromName(String uom) {
    /*
     * Before to search in resource bundles, check for degrees units. The "deg" unit can be both angular
     * and Celsius degrees. We try to resolve this ambiguity by looking for the "C" suffix. We perform a
     * special case for the degrees units because SI symbols are case-sensitive and unit names in resource
     * bundles are case-insensitive, but the "deg" case is a mix of both.
     */
    final int length = uom.length();
    for (int i=0; ; i++) {
        if (i != DEGREES.length()) {
            if (i != length && (uom.charAt(i) | ('a' - 'A')) == DEGREES.charAt(i)) {
                continue;                       // Loop as long as the characters are the same, ignoring case.
            }
            if (i != 3 && i != 6) {
                break;                          // Exit if not "deg" (3) or "degree" (6 characters).
            }
        }
        if (length == i) {
            return Units.DEGREE;                // Exactly "deg", "degree" or "degrees" (ignoring case).
        }
        final int c = uom.codePointAt(i);
        if (c == '_' || Character.isSpaceChar(c)) {
            i += Character.charCount(c);        // Ignore space in "degree C", "deg C", "deg K", etc.
        }
        if (length - i == 1) {
            switch (uom.charAt(i)) {
                /*
                 * The Kelvin sign is written as a Unicode escape. If the literal character is
                 * normalized to the ASCII letter by an editor or a text processing tool, the
                 * two labels become identical and the switch no longer compiles.
                 */
                case '\u212A':                  // U+212A (Kelvin sign)
                case 'K': return Units.KELVIN;  // "degK" (ignoring case except for 'K')
                case 'C': return Units.CELSIUS;
                case 'N':                       // degree_N, degrees_N, degreeN, degreesN.
                case 'E': return Units.DEGREE;  // degree_E, degrees_E, degreeE, degreesE.
            }
        }
        break;
    }
    /*
     * At this point, we determined that the given unit symbol is not degrees (of angle or of temperature).
     * Remaining code is generic to all other kinds of units: a check in a HashMap loaded when first needed.
     */
    Map<String,Unit<?>> map = nameToUnit;
    if (map == null) {
        map = SHARED.get(locale);
        if (map == null) {
            map = new HashMap<>(128);
            copy(locale, symbolToName(), map);
            if (!locale.equals(Locale.US))   copy(Locale.US,   getBundle(Locale.US),   map);
            if (!locale.equals(Locale.ROOT)) copy(Locale.ROOT, getBundle(Locale.ROOT), map);
            /*
             * The UnitAliases file contains names that are not unit symbols and are not included in the UnitNames
             * property files neither. It contains longer names sometime used (for example "decimal degree" instead
             * of "degree"), some plural forms (for example "feet" instead of "foot") and a few common misspellings
             * (for example "Celcius" instead of "Celsius").
             */
            final ResourceBundle r = ResourceBundle.getBundle("org.apache.sis.measure.UnitAliases", locale, UnitFormat.class.getClassLoader());
            for (final String name : r.keySet()) {
                map.put(name.intern(), Units.get(r.getString(name)));
            }
            map = Collections.unmodifiableMap(map);
            /*
             * Cache the map so we can share it with other UnitFormat instances.
             * Sharing is safe if the map is unmodifiable. If an equal map already
             * exists for another locale, that existing instance is reused.
             */
            synchronized (SHARED) {
                for (final Map<String,Unit<?>> existing : SHARED.values()) {
                    if (map.equals(existing)) {
                        map = existing;
                        break;
                    }
                }
                SHARED.put(locale, map);
            }
        }
        nameToUnit = map;
    }
    /*
     * The 'nameToUnit' map contains plural forms (declared in UnitAliases.properties),
     * but we make a special case for "degrees", "metres" and "meters" because they
     * appear in numerous places.
     */
    uom = uom.replace('_', ' ').toLowerCase(locale);
    uom = CharSequences.replace(CharSequences.replace(CharSequences.replace(CharSequences.toASCII(uom),
            "meters", "meter"),
            "metres", "metre"),
            DEGREES, "degree").toString();
    /*
     * Returns the unit with application of the power if it is part of the name.
     * For example this method interprets "meter2" as "meter" raised to power 2.
     * The scan goes backward over the trailing digits and an optional minus sign.
     */
    Unit<?> unit = map.get(uom);
appPow: if (unit == null) {
        int s = uom.length();
        if (--s > 0 && isDigit(uom.charAt(s))) {
            do if (--s < 0) break appPow;
            while (isDigit(uom.charAt(s)));
            if (uom.charAt(s) == '-') {
                if (--s < 0) break appPow;
            }
            unit = map.get(uom.substring(0, ++s));
            if (unit != null) {
                unit = unit.pow(Integer.parseInt(uom.substring(s)));
            }
        }
    }
    return unit;
}
/**
 * Copies all entries from the given "symbols to names" mapping to the given "names to units" mapping.
 * During this copy, keys are converted from symbols to names and values are converted from symbols to
 * {@code Unit} instance. We use {@code Unit} values instead of their symbols because all {@code Unit}
 * instances are created at {@link Units} class initialization anyway (so we do not create new instance
 * here), and it avoids retaining references to the {@link String} instances loaded by the resource bundle.
 *
 * @param  locale        the locale to use for converting names to lower case.
 * @param  symbolToName  the resource bundle to read.
 * @param  nameToUnit    the map where to write the entries.
 */
private static void copy(final Locale locale, final ResourceBundle symbolToName, final Map<String,Unit<?>> nameToUnit) {
    for (final String symbol : symbolToName.keySet()) {
        // Names are stored in lower case, restricted to ASCII characters and interned.
        final String lowerCaseName = symbolToName.getString(symbol).toLowerCase(locale);
        final String key = CharSequences.toASCII(lowerCaseName).toString().intern();
        nameToUnit.put(key, Units.get(symbol));
    }
}
/**
* Formats the specified unit.
* This method performs the first of the following actions that can be done.
*
* <ol>
* <li>If a {@linkplain #label(Unit, String) label has been specified} for the given unit,
* then that label is appended unconditionally.</li>
* <li>Otherwise if the formatting style is {@link Style#NAME} and the {@link Unit#getName()} method
* returns a non-null value, then that value is appended. {@code Unit} instances implemented by
* Apache SIS are handled in a special way for localizing the name according the
* {@linkplain #setLocale(Locale) locale specified to this format}.</li>
* <li>Otherwise if the {@link Unit#getSymbol()} method returns a non-null value,
* then that value is appended.</li>
* <li>Otherwise a default symbol is created from the entries returned by {@link Unit#getBaseUnits()}.</li>
* </ol>
*
* @param unit the unit to format.
* @param toAppendTo where to format the unit.
* @return the given {@code toAppendTo} argument, for method calls chaining.
* @throws IOException if an error occurred while writing to the destination.
* @throws IllegalArgumentException if a default symbol needs to be created but the scale factor
* between the given unit and its system unit is {@linkplain Double#isNaN(double) NaN}.
*/
@Override
public Appendable format(final Unit<?> unit, final Appendable toAppendTo) throws IOException {
ArgumentChecks.ensureNonNull("unit", unit);
ArgumentChecks.ensureNonNull("toAppendTo", toAppendTo);
/*
* Choice 1: label specified by a call to label(Unit, String).
*/
{
final String label = unitToLabel.get(unit);
if (label != null) {
return toAppendTo.append(label);
}
}
/*
* Choice 2: value specified by Unit.getName(). We skip this check if the given Unit is an instance
* implemented by Apache SIS because AbstractUnit.getName() delegates to the same resource bundle
* than the one used by this block. We are better to use the resource bundle of the UnitFormat both
* for performance reasons and because the locale may not be the same.
*/
if (style == Style.NAME) {
if (!(unit instanceof AbstractUnit)) {
final String label = unit.getName();
if (label != null) {
return toAppendTo.append(label);
}
} else {
String label = unit.getSymbol();
if (label != null) {
// The empty symbol has no resource key of its own; the "unity" key is used instead.
if (label.isEmpty()) {
label = UNITY;
}
// Following is not thread-safe, but it is okay since we do not use INSTANCE for unit names.
final ResourceBundle names = symbolToName();
try {
label = names.getString(label);
} catch (MissingResourceException e) {
Logging.ignorableException(Logging.getLogger(Loggers.MEASURE), UnitFormat.class, "format", e);
// Name not found; use the symbol as a fallback.
}
return toAppendTo.append(label);
}
}
}
/*
* Choice 3: if the unit has a specific symbol, appends that symbol.
* Apache SIS implementation uses Unicode characters in the symbol, which are not valid for UCUM.
* But Style.UCUM.appendSymbol(…) performs required replacements.
*/
{
final String symbol = unit.getSymbol();
if (symbol != null) {
return style.appendSymbol(toAppendTo, symbol);
}
}
/*
* Choice 4: if all the above failed, fallback on a symbol created from the base units and their power.
* Note that this may produce more verbose symbols than needed because derived units like Volt or Watt
* are decomposed into their base SI units. The scale factor will be inserted before the unit components,
* e.g. "30⋅m∕s". Note that a scale factor relative to system unit may not be what we want if the unit
* contains "kg", since it blocks us from using SI prefixes. But in many cases (not all), a symbol will
* have been created by SystemUnit.transform(…), in which case "Choice 3" above would have been executed.
*/
final Unit<?> unscaled = unit.getSystemUnit();
@SuppressWarnings("unchecked") // Both 'unit' and 'unscaled' are 'Unit<Q>'.
final double scale = AbstractConverter.scale(unit.getConverterTo((Unit) unscaled));
if (Double.isNaN(scale)) {
throw new IllegalArgumentException(Errors.format(Errors.Keys.NonRatioUnit_1,
"?⋅" + Style.OPEN + unscaled + Style.CLOSE));
}
/*
* In addition of the scale, we will need to know:
*
* - The components (for example "m" and "s" in "m∕s").
* - Whether we have at least one component on the left side of "∕" operation.
* Used for determining if we should prepend "1" before the "∕" symbol.
* - If there is exactly one component on the left side of "∕" and that component
* is prefixable, the power raising that component. Used for choosing a prefix.
*
* In the two loops below, 'prefixPower' is reset to zero and the loop is stopped as soon as a
* second component with a positive power is found, or if a component can not take a prefix.
*/
int prefixPower = 0;
boolean hasNumerator = false;
final Map<? extends Unit<?>, ? extends Number> components;
if (unscaled instanceof AbstractUnit<?>) {
// In Apache SIS implementation, power may be fractional.
final Map<SystemUnit<?>, Fraction> c = ((AbstractUnit<?>) unscaled).getBaseSystemUnits();
components = c;
for (final Map.Entry<SystemUnit<?>, Fraction> e : c.entrySet()) {
final Fraction power = e.getValue();
if (power.signum() > 0) {
hasNumerator = true;
if (prefixPower == 0 && power.denominator == 1 && e.getKey().isPrefixable()) {
prefixPower = power.numerator;
} else {
prefixPower = 0;
break;
}
}
}
} else {
// Fallback for foreign implementations (power restricted to integer).
Map<? extends Unit<?>, Integer> c = unscaled.getBaseUnits();
// NOTE(review): the fallback component is 'unit' rather than 'unscaled' while the scale
// is formatted separately below; confirm that this is the intended component.
if (c == null) c = Collections.singletonMap(unit, 1);
components = c;
for (final Map.Entry<? extends Unit<?>, Integer> e : c.entrySet()) {
final int power = e.getValue();
if (power > 0) {
hasNumerator = true;
if (prefixPower == 0 && AbstractUnit.isPrefixable(e.getKey())) {
prefixPower = power;
} else {
prefixPower = 0;
break;
}
}
}
}
/*
* Append the scale factor. If we can use a prefix (e.g. "km" instead of "1000⋅m"), we will do that.
* Otherwise if the scale is a power of 10 and we are allowed to use Unicode symbols, we will write
* for example 10⁵⋅m instead of 100000⋅m. If the scale is not a power of 10, or if we are requested
* to format UCUM symbol, then we fallback on the usual 'Double.toString(double)' representation.
*/
if (scale != 1) {
final char prefix = Prefixes.symbol(scale, prefixPower);
if (prefix != 0) {
toAppendTo.append(Prefixes.concat(prefix, ""));
} else {
boolean asPowerOf10 = (style != Style.UCUM);
if (asPowerOf10) {
double power = Math.log10(scale);
// The first argument is evaluated before 'power' is replaced by its rounded value,
// so this compares the exact logarithm with the nearest integer.
asPowerOf10 = AbstractConverter.epsilonEquals(power, power = Math.round(power));
if (asPowerOf10) {
toAppendTo.append("10");
final String text = Integer.toString((int) power);
for (int i=0; i<text.length(); i++) {
toAppendTo.append(Characters.toSuperScript(text.charAt(i)));
}
}
}
if (!asPowerOf10) {
final String text = Double.toString(scale);
int length = text.length();
// Omit the trailing ".0" of integer values.
if (text.endsWith(".0")) length -= 2;
toAppendTo.append(text, 0, length);
}
/*
* The 'formatComponents' method appends division symbol only, no multiplication symbol.
* If we have formatted a scale factor and there is at least one component to multiply,
* we need to append the multiplication symbol ourselves. Note that 'formatComponents'
* puts numerators before denominators, so we are sure that the first term after the
* multiplication symbol is a numerator.
*/
if (hasNumerator) {
toAppendTo.append(style.multiply);
}
}
} else if (!hasNumerator) {
// No scale and no numerator: write "1" so that the division symbol has a left operand.
toAppendTo.append('1');
}
formatComponents(components, style, toAppendTo);
return toAppendTo;
}
/**
 * Creates a new symbol (e.g. "m/s") from the given symbols and factors.
 * Keys in the given map can be either {@link Unit} or {@link Dimension} instances.
 * Values in the given map are either {@link Integer} or {@link Fraction} instances.
 * Components with a positive power are written first, separated by the multiplication symbol.
 * Components with a negative power are written after the division symbol, between
 * parenthesis if there is more than one. Components with a power of zero are omitted.
 *
 * @param  components  the components of the symbol to format.
 * @param  style       whether to allow Unicode characters.
 * @param  toAppendTo  where to write the symbol.
 * @throws IOException if an error occurred while writing to the destination.
 */
static void formatComponents(final Map<?, ? extends Number> components, final Style style, final Appendable toAppendTo)
        throws IOException
{
    final List<Map.Entry<?,? extends Number>> denominators = new ArrayList<>(components.size());
    boolean needsSeparator = false;
    for (final Map.Entry<?,? extends Number> term : components.entrySet()) {
        final Number exponent = term.getValue();
        final int sign;
        if (exponent instanceof Fraction) {
            sign = ((Fraction) exponent).numerator;
        } else {
            sign = exponent.intValue();
        }
        if (sign == 0) {
            continue;                       // Nothing to write for this component.
        }
        if (sign < 0) {
            denominators.add(term);         // Will be written after all numerators.
            continue;
        }
        if (needsSeparator) {
            toAppendTo.append(style.multiply);
        }
        needsSeparator = true;
        formatComponent(term, false, style, toAppendTo);
    }
    /*
     * At this point, all numerators have been appended. Now append the denominators together.
     * For example pressure dimension is formatted as M∕(L⋅T²) no matter if 'M' was the first
     * dimension in the given 'components' map or not.
     */
    if (denominators.isEmpty()) {
        return;
    }
    toAppendTo.append(style.divide);
    final boolean isGrouped = denominators.size() > 1;
    if (isGrouped) {
        toAppendTo.append(Style.OPEN);
    }
    needsSeparator = false;
    for (final Map.Entry<?,? extends Number> term : denominators) {
        if (needsSeparator) {
            toAppendTo.append(style.multiply);
        }
        needsSeparator = true;
        formatComponent(term, true, style, toAppendTo);
    }
    if (isGrouped) {
        toAppendTo.append(Style.CLOSE);
    }
}
/**
 * Formats a single unit or dimension raised to the given power.
 * The symbol is always written. The power is omitted if, after the optional
 * sign inversion, it is the integer 1.
 *
 * @param  entry       the base unit or base dimension to format, together with its power.
 * @param  inverse     {@code true} for inverting the power sign.
 * @param  style       whether to allow Unicode characters.
 * @param  toAppendTo  where to write the symbol and its power.
 * @throws IOException if an error occurred while writing to the destination.
 */
private static void formatComponent(final Map.Entry<?,? extends Number> entry, final boolean inverse,
        final Style style, final Appendable toAppendTo) throws IOException
{
    formatSymbol(entry.getKey(), style, toAppendTo);
    final Number exponent = entry.getValue();
    final int integerPower;
    if (exponent instanceof Fraction) {
        final Fraction rational = (Fraction) exponent;
        if (rational.denominator != 1) {
            // Real fraction: delegate to the formatting of rational powers.
            style.appendPower(toAppendTo, inverse ? rational.negate() : rational);
            return;
        }
        integerPower = rational.numerator;
    } else {
        integerPower = exponent.intValue();
    }
    final int signedPower = inverse ? -integerPower : integerPower;
    if (signedPower != 1) {
        style.appendPower(toAppendTo, signedPower);
    }
}
/**
 * Appends the symbol for the given base unit or base dimension, or "?" if no symbol was found.
 * If the given object is a unit, then it should be an instance of {@link SystemUnit}.
 * Dimension symbols are written as-is, while unit symbols are written through
 * {@link Style#appendSymbol(Appendable, String)}.
 *
 * @param  base        the base unit or base dimension to format.
 * @param  style       whether to allow Unicode characters.
 * @param  toAppendTo  where to append the symbol.
 * @throws IOException if an error occurred while writing to the destination.
 */
private static void formatSymbol(final Object base, final Style style, final Appendable toAppendTo) throws IOException {
    if (base instanceof UnitDimension) {
        final char dimensionSymbol = ((UnitDimension) base).symbol;
        if (dimensionSymbol != 0) {
            toAppendTo.append(dimensionSymbol);
            return;
        }
    }
    final String unitSymbol = (base instanceof Unit<?>) ? ((Unit<?>) base).getSymbol() : null;
    if (unitSymbol == null) {
        toAppendTo.append('?');
    } else {
        style.appendSymbol(toAppendTo, unitSymbol);
    }
}
/**
 * Formats the specified unit in the given buffer.
 * The given object is cast to {@link Unit} and the work is delegated to {@link #format(Unit, Appendable)}.
 *
 * @param  unit        the unit to format.
 * @param  toAppendTo  where to format the unit.
 * @param  pos         where to store the position of a formatted field (not used by this implementation).
 * @return the buffer returned by {@link #format(Unit, Appendable)}, for method calls chaining.
 */
@Override
public StringBuffer format(final Object unit, final StringBuffer toAppendTo, final FieldPosition pos) {
    final Appendable result;
    try {
        result = format((Unit<?>) unit, toAppendTo);
    } catch (IOException e) {
        // Should never happen since we are writing to a StringBuffer.
        throw new UncheckedIOException(e);
    }
    return (StringBuffer) result;
}
/**
 * Formats the given unit as a character string.
 * This method writes into a new {@link StringBuilder} by delegating to {@link #format(Unit, Appendable)}.
 *
 * @param  unit  the unit to format.
 * @return the formatted unit.
 */
@Override
public String format(final Unit<?> unit) {
    final StringBuilder buffer = new StringBuilder();
    try {
        return format(unit, buffer).toString();
    } catch (IOException e) {
        // Should never happen since we are writing to a StringBuilder.
        throw new UncheckedIOException(e);
    }
}
/**
 * Returns {@code 0} or {@code 1} if the {@code '*'} character at the given index stands for exponentiation
 * rather than multiplication, or a negative value if the character stands for multiplication. This check
 * is used for heuristic rules at parsing time. Current implementation applies the following rules:
 *
 * <ul>
 *   <li>The operation is presumed an exponentiation if the '*' symbol is doubled, as in {@code "m**s-1"}.</li>
 *   <li>The operation is presumed an exponentiation if there is a digit on its left side and a digit
 *       or a sign on its right side. Example: {@code "10*-6"}, which means 1E-6 in UCUM syntax.</li>
 *   <li>All other cases are currently presumed multiplication. Example: {@code "m*s"}.
 *       This includes a {@code '*'} at the very beginning of the character sequence,
 *       since there is no character on its left side.</li>
 * </ul>
 *
 * @param  symbols  the characters being parsed.
 * @param  i        index of the {@code '*'} character in {@code symbols}.
 * @param  length   index after the last character to consider in {@code symbols}.
 * @return -1 for parsing as a multiplication, or a positive value for exponentiation.
 *         If positive, this is the number of characters in the exponent symbol minus 1.
 */
private static int exponentOperator(final CharSequence symbols, int i, final int length) {
    if (i >= 0 && ++i < length) {
        // From this point, 'i' is the index of the character after '*'.
        final char c = symbols.charAt(i);       // No need for code point because next conditions are true only in BMP.
        if (c == Style.EXPONENT_OR_MULTIPLY) {
            return 1;                           // "**" operator: need to skip one character after '*'.
        }
        /*
         * The character before '*' is at index i-2. It exists only if i >= 2; without that check,
         * a text starting with '*' followed by a digit or a sign (e.g. "*2") would cause
         * an IndexOutOfBoundsException instead of being handled by the parser.
         */
        if ((isDigit(c) || isSign(c)) && i >= 2 && isDigit(symbols.charAt(i-2))) {
            return 0;                           // "*" operator surrounded by digits: no character to skip.
        }
    }
    return -1;
}
/**
 * Returns {@code true} if the {@code '.'} character at the given index shall be handled as a decimal
 * separator rather than as a multiplication operator. This check is used for heuristic rules.
 * The rules applied by this implementation are:
 *
 * <ul>
 *   <li>A period at index 0 is always a decimal separator, whatever the following character is.</li>
 *   <li>Otherwise the period must be preceded by a digit, and must be either the last character
 *       (according the given {@code length}) or followed by a digit.</li>
 * </ul>
 *
 * @param  symbols  the characters being parsed.
 * @param  i        index of the {@code '.'} character in {@code symbols}.
 * @param  length   index after the last character to consider in {@code symbols}.
 * @return whether the period is presumed to be a decimal separator.
 *
 * @see Style#UCUM
 */
private static boolean isDecimalSeparator(final CharSequence symbols, int i, final int length) {
    if (i == 0) {
        return true;
    }
    if (!isDigit(symbols.charAt(i - 1))) {
        return false;
    }
    final int next = i + 1;
    return next >= length || isDigit(symbols.charAt(next));
}
/**
 * Returns {@code true} if the given character is one of the ASCII digits {@code '0'} to {@code '9'}.
 * This is the meaning of "digit" for the {@code UnitFormat} parser, which is much more restrictive
 * than {@link Character#isDigit(int)}.
 *
 * <p>A return value of {@code true} guarantees that the given character is in the Basic Multilingual Plane (BMP).
 * Consequently the {@code c} argument value does not need to be the result of {@link String#codePointAt(int)};
 * the result of {@link String#charAt(int)} is sufficient. The {@code int} type is nevertheless used for avoiding
 * the need to cast if the caller uses code points for another reason.</p>
 *
 * @param  c  the character or code point to test.
 * @return whether the given character is an ASCII digit.
 *
 * @see Character#isBmpCodePoint(int)
 */
private static boolean isDigit(final int c) {
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
/**
 * Returns {@code true} if the given character is the sign of a number according the {@code UnitFormat} parser,
 * i.e. the ASCII {@code '+'} or {@code '-'} character.
 * A return value of {@code true} guarantees that the given character is in the Basic Multilingual Plane (BMP).
 * Consequently the {@code c} argument value does not need to be the result of {@link String#codePointAt(int)}.
 *
 * @param  c  the character or code point to test.
 * @return whether the given character is a plus or minus sign.
 */
private static boolean isSign(final int c) {
    switch (c) {
        case '+':
        case '-': return true;
        default:  return false;
    }
}
/**
 * Returns {@code true} if the given character is a division operator,
 * i.e. the ASCII {@code '/'} character or the {@link AbstractUnit#DIVIDE} character.
 * A return value of {@code true} guarantees that the given character is in the Basic Multilingual Plane (BMP).
 * Consequently the {@code c} argument value does not need to be the result of {@link String#codePointAt(int)}.
 *
 * @param  c  the character or code point to test.
 * @return whether the given character is a division operator.
 */
private static boolean isDivisor(final int c) {
    switch (c) {
        case '/':
        case AbstractUnit.DIVIDE: return true;
        default: return false;
    }
}
/**
 * Returns {@code true} if the given range of characters contains at least one ASCII digit.
 * This is a hack for allowing to recognize units like "100 feet" (in principle not
 * legal, but seen in practice). This verification has some value if digits are not
 * allowed as unit label or symbol.
 *
 * @param  symbol  the characters to test.
 * @param  lower   index of the first character to test, inclusive.
 * @param  upper   index after the last character to test.
 * @return whether at least one digit has been found in the given range.
 */
private static boolean hasDigit(final CharSequence symbol, int lower, final int upper) {
    for (int i = lower; i < upper; i++) {
        if (isDigit(symbol.charAt(i))) {
            return true;
        }
    }
    return false;
}
/**
 * The parse position used when the whole text is expected to be a unit symbol and nothing else.
 * In addition to the index inherited from {@link ParsePosition}, this class remembers whether
 * another term (separated from the previous term by a space) is still allowed after the last
 * parsed term.
 */
private static final class Position extends ParsePosition {
    /**
     * {@code true} if we do not expect any more content after the last term parsed.
     * The initial value is {@code false}.
     */
    boolean finished;

    /**
     * Creates a new position initialized to the beginning of the text to parse.
     */
    Position() {
        super(0);
    }
}
/**
 * Declares that the parsing is finished and that no more content should be parsed.
 * This method is invoked when the last parsed term is possibly one or more words rather than unit symbols.
 * The intent is to avoid trying to parse "degree minute" as "degree × minute". By contrast, this method is
 * not invoked if the string to parse is "m kg**-2" because it can be interpreted as "m × kg**-2".
 *
 * <p>The flag can be stored only in {@link Position} instances;
 * this method does nothing for any other kind of parse position.</p>
 *
 * @param  pos  the parse position where to store the flag, if possible.
 */
private static void finish(final ParsePosition pos) {
    if (!(pos instanceof Position)) {
        return;
    }
    final Position p = (Position) pos;
    p.finished = true;
}
/**
 * Parses the given text as an instance of {@code Unit}.
 * The whole text shall be a unit: if some characters other than white spaces
 * can not be parsed, an exception is thrown.
 *
 * <p>The parsing is lenient: symbols can be products or quotients of units like “m∕s”,
 * words like “meters per second”, or authority codes like {@code "urn:ogc:def:uom:EPSG::1026"}.
 * The product operator can be either {@code '.'} (ASCII) or {@code '⋅'} (Unicode) character.
 * Exponent after symbol can be decimal digits as in “m2” or a superscript as in “m²”.</p>
 *
 * <p>This method differs from {@link #parse(CharSequence, ParsePosition)} in the treatment of white spaces:
 * that method with a {@link ParsePosition} argument stops parsing at the first white space,
 * while this {@code parse(…)} method treats white spaces as multiplications.
 * The reason for this difference is that white space is normally not a valid multiplication symbol;
 * it could be followed by a text which is not part of the unit symbol.
 * But in the case of this {@code parse(CharSequence)} method, the whole {@code CharSequence} shall be a unit symbol.
 * In such case, white spaces are less ambiguous.</p>
 *
 * <p>The implementation invokes {@link #parse(CharSequence, ParsePosition)} repetitively, starting at index 0,
 * and multiplies the results together. For example "kg m**-2" is parsed as kg/m². A new term is accepted only
 * if it begins with a letter and if the previous term has not been flagged as the last one (which happens for
 * example when the previous term was a unit name made of words).</p>
 *
 * @param  symbols  the unit symbols or URI to parse.
 * @return the unit parsed from the specified symbols.
 * @throws ParserException if a problem occurred while parsing the given symbols.
 *
 * @see Units#valueOf(String)
 */
@Override
public Unit<?> parse(final CharSequence symbols) throws ParserException {
    final Position cursor = new Position();
    Unit<?> result = parse(symbols, cursor);
    final int length = symbols.length();
    for (;;) {
        final int next = CharSequences.skipLeadingWhitespaces(symbols, cursor.getIndex(), length);
        if (next >= length) {
            return result;                      // Only white spaces (if any) remain.
        }
        final boolean isNewTerm = !cursor.finished && Character.isLetter(Character.codePointAt(symbols, next));
        if (!isNewTerm) {
            final String message = Errors.format(Errors.Keys.UnexpectedCharactersAfter_2,
                    CharSequences.trimWhitespaces(symbols, 0, next),
                    CharSequences.trimWhitespaces(symbols, next, length));
            throw new ParserException(message, symbols, next);
        }
        // The space before the next term is interpreted as a multiplication.
        cursor.setIndex(next);
        result = result.multiply(parse(symbols, cursor));
    }
}
/**
* Parses a portion of the given text as an instance of {@code Unit}.
* Parsing begins at the index given by {@link ParsePosition#getIndex()}.
* After parsing, the above-cited index is updated to the first unparsed character.
*
* <p>The parsing is lenient: symbols can be products or quotients of units like “m∕s”,
* words like “meters per second”, or authority codes like {@code "urn:ogc:def:uom:EPSG::1026"}.
* The product operator can be either {@code '.'} (ASCII) or {@code '⋅'} (Unicode) character.
* Exponent after symbol can be decimal digits as in “m2” or a superscript as in “m²”.</p>
*
* <p>Note that contrarily to {@link #parseObject(String, ParsePosition)}, this method never return {@code null}.
* If an error occurs at parsing time, an unchecked {@link ParserException} is thrown.</p>
*
* @param symbols the unit symbols to parse.
* @param position on input, index of the first character to parse.
* On output, index after the last parsed character.
* @return the unit parsed from the specified symbols.
* @throws ParserException if a problem occurred while parsing the given symbols.
*/
@SuppressWarnings({"null", "fallthrough"})
public Unit<?> parse(CharSequence symbols, final ParsePosition position) throws ParserException {
/*
 * Implementation overview:
 *   1) If the text is a URI, try to resolve it as an EPSG code or to extract a symbol from a xpointer.
 *   2) Scan the characters for splitting the text around multiplication and division operators,
 *      parsing each term with parseTerm(…) and invoking this method recursively for parenthesis.
 *   3) Parse the last term, with special handling if spaces were found in that term.
 */
ArgumentChecks.ensureNonNull("symbols", symbols);
ArgumentChecks.ensureNonNull("position", position);
/*
* Check for authority codes (currently only EPSG, but more could be added later).
* Example: "urn:ogc:def:uom:EPSG::9001". If the unit is not an authority code
* (which is the most common case), only then we will parse the unit symbols.
*/
int end = symbols.length();
int start = CharSequences.skipLeadingWhitespaces(symbols, position.getIndex(), end);
int endOfURI = XPaths.endOfURI(symbols, start);
if (endOfURI >= 0) {
final String uom = symbols.subSequence(start, endOfURI).toString();
String code = DefinitionURI.codeOf("uom", Constants.EPSG, uom);
/*
* DefinitionURI.codeOf(…) returns 'uom' directly (provided that whitespaces were already trimmed)
* if no ':' character were found, in which case the string is assumed to be the code directly.
* This is the intended behavior for AuthorityFactory, but in the particular case of this method
* we want to try to parse as a xpointer before to give up. This is why the identity comparison
* (code != uom) below is intentional.
*/
if (code != null && code != uom) {
NumberFormatException failure = null;
try {
final Unit<?> unit = Units.valueOfEPSG(Integer.parseInt(code));
if (unit != null) {
position.setIndex(endOfURI);
finish(position);
return unit;
}
} catch (NumberFormatException e) {
failure = e;
}
// Either the code is not an integer ('failure' is non-null) or no unit is associated to that code.
throw (ParserException) new ParserException(Errors.format(Errors.Keys.UnknownUnit_1,
Constants.EPSG + Constants.DEFAULT_SEPARATOR + code),
symbols, start + Math.max(0, uom.lastIndexOf(code))).initCause(failure);
}
/*
* Not an EPSG code. Maybe it is a URI like this example:
* http://schemas.opengis.net/iso/19139/20070417/resources/uom/gmxUom.xml#xpointer(//*[@gml:id='m'])
*
* If we find such 'uom' value, we could replace 'symbols' by that 'uom'. But it would cause a wrong
* error index to be reported in case of parsing failure. We will rather try to adjust the indices
* (and replace 'symbols' only in last resort).
*/
code = XPaths.xpointer("uom", uom);
if (code != null) {
final int base = start;
start = endOfURI - code.length();
// Search backward, from the end of the URI, for the position where 'code' appears in 'symbols'.
do if (--start < base) { // Should never happen (see above comment), but we are paranoiac.
symbols = code;
start = 0;
break;
} while (!CharSequences.regionMatches(symbols, start, code));
end = start + code.length();
} else {
endOfURI = -1;
}
}
/*
* Split the unit around the multiplication and division operators and parse each term individually.
* Note that exponentiation needs to be kept as part of a single unit symbol.
*
* The 'start' variable is the index of the first character of the next unit term to parse.
* The 'i' variable is the index of the character currently examined, and 'n' is the number
* of 'char' values used by that character.
*/
final Operation operation = new Operation(symbols); // Operation code: NOOP, IMPLICIT, MULTIPLY, DIVIDE or EXPONENT.
Unit<?> unit = null;
boolean hasSpaces = false;
int i = start;
scan: for (int n; i < end; i += n) {
final int c = Character.codePointAt(symbols, i);
n = Character.charCount(c);
final int next;
switch (c) {
/*
* The minus sign can be both part of a number or part of a symbol. If the minus sign is followed
* by a digit, then handle it as part of a number, in which case the action is only "continue".
* Otherwise handle as part of a symbol, in which case the action is in the default case below.
* The intent is to prevent the replacement of Operation.IMPLICIT by Operation.MULTIPLY in symbol
* like "(m²⋅s)-1" because we want the "-1" part to be handled as Operation.EXPONENT instead.
*/
case '-': {
if (i + n < end && Character.isDigit(Character.codePointAt(symbols, i + n))) {
continue;
}
// else fall through.
}
/*
* For any character that is not an operator or parenthesis, either continue the scanning of
* characters or stop it, depending on whether the character is valid for a unit symbol or not.
* In the latter case, we consider that we reached the end of a unit symbol.
*/
default: {
if (AbstractUnit.isSymbolChar(c)) {
if (operation.code == Operation.IMPLICIT) {
operation.code = Operation.MULTIPLY;
}
continue;
}
if (Character.isDigit(c) || Characters.isSuperScript(c)) {
continue;
}
if (Character.isSpaceChar(c)) { // NOT Character.isWhitespace(int)
hasSpaces = true;
continue;
}
break scan;
}
/*
* Star is for exponentiation in UCUM syntax, but some symbols may use it for unit multiplication.
* We interpret the symbol as a multiplication if the characters before or after it seem to be for
* a unit symbol.
*/
case Style.EXPONENT_OR_MULTIPLY: {
final int w = exponentOperator(symbols, i, end);
if (w < 0) {
next = Operation.MULTIPLY;
break;
}
i += w; // Skip the second '*' of the "**" operator, if any.
// else fall through.
}
case Style.EXPONENT: {
if (operation.code == Operation.IMPLICIT) {
next = Operation.EXPONENT;
break;
}
continue;
}
/*
* The period is the multiplication operator in UCUM format. According UCUM there is no ambiguity
* with the decimal separator since unit terms should not contain floating point numbers. However
* we relax this rule in order to support scale factor of angular units (e.g. π/180). Whether the
* period is interpreted as a decimal separator is decided by isDecimalSeparator(…).
*/
case '.': if (isDecimalSeparator(symbols, i, end)) continue;
case '×': // Fall through
case AbstractUnit.MULTIPLY: next = Operation.MULTIPLY; break;
case '÷':
case '⁄': // Fraction slash
case '/':
case AbstractUnit.DIVIDE: next = Operation.DIVIDE; break;
/*
* If we find an '(' parenthesis, invoke recursively this method for the part inside parenthesis.
* The parsing should end at the ')' parenthesis since it is not a valid unit symbol. If we do not
* find that closing parenthesis, this will be considered an error.
*/
case Style.OPEN: {
final int pos = i + Character.charCount(c);
final ParsePosition sub = new ParsePosition(pos);
final Unit<?> term = parse(symbols, sub);
i = CharSequences.skipLeadingWhitespaces(symbols, sub.getIndex(), end);
if (i >= end || Character.codePointAt(symbols, i) != Style.CLOSE) {
throw new ParserException(Errors.format(Errors.Keys.NonEquilibratedParenthesis_2,
symbols.subSequence(start, i), Style.CLOSE), symbols, start);
}
unit = operation.apply(unit, term, pos);
operation.code = Operation.IMPLICIT; // Default operation if there is no × or / symbols after parenthesis.
start = i + (n = 1); // Skip the ')' character, which is at index 'i' at this point.
continue;
}
}
/*
* We reach this point only if we found some operator (division, multiplication or exponentiation).
* If the operator has been found between two digits, we consider it as part of the
* term. For example "m2/3" is considered as a single term where "2/3" is the exponent.
*/
if (i > start && i+n < end
&& Character.isDigit(Character.codePointBefore(symbols, i))
&& Character.isDigit(Character.codePointAt(symbols, i+n)))
{
continue;
}
/*
* At this point, we have either a first unit to parse (NOOP), or a multiplication or division to apply
* between the previously parsed units and the next unit to parse. A special case is IMPLICIT, which is
* a multiplication without explicit × symbol after the parenthesis. The implicit multiplication can be
* overridden by an explicit × or / symbol, which is what happened if we reach this point (tip: look in
* the above 'switch' statement all cases that end with 'break', not 'break scan' or 'continue').
*/
if (operation.code != Operation.IMPLICIT) {
unit = operation.apply(unit, parseTerm(symbols, start, i, operation), start);
}
hasSpaces = false; // Spaces found in the term just parsed are not relevant to the next term.
operation.code = next;
start = i + n;
}
/*
* At this point we either found an unrecognized character or reached the end of string. We will
* parse the remaining characters as a unit and apply the pending unit operation (multiplication
* or division). But before, we need to check if the parsing should stop at the first whitespace.
* This verification assumes that spaces are allowed only in labels specified by the label(…)
* method and in resource bundles, not in labels specified by AbstractUnit.alternate(String).
*/
Unit<?> component = null;
if (hasSpaces) {
end = i;
start = CharSequences.skipLeadingWhitespaces(symbols, start, i);
// Try progressively shorter substrings, removing the last word on each iteration.
search: while ((i = CharSequences.skipTrailingWhitespaces(symbols, start, i)) > start) {
final String uom = symbols.subSequence(start, i).toString();
if ((component = labelToUnit.get(uom)) != null) break;
if ((component = fromName(uom)) != null) break;
int j=i, c;
do {
c = Character.codePointBefore(symbols, j);
j -= Character.charCount(c);
if (j <= start) break search;
} while (!Character.isWhitespace(c));
/*
* Really use Character.isWhitespace(c) above, not Character.isSpaceChar(c), because we want
* to exclude non-breaking spaces. This block should be the only place in UnitFormat class
* where we use isWhitespace(c) instead of isSpaceChar(c).
*/
i = j; // Will become the index of first space after search loop completion.
}
/*
* If 'component' is still null at this point, we did not find any user-specified label or localized
* name matching the substring. Assume that the parsing should stop at the first space, on the basis
* that spaces are not allowed in unit symbols. We make an exception if we detect that the part before
* the first space contains digits (not allowed in unit symbols either), in which case the substring
* may be something like "100 feet".
*/
if (hasDigit(symbols, start, i)) {
i = end; // Restore the full length (until the first illegal character).
}
}
// A unit found by label or by name above is flagged as the last term; otherwise parse the term now.
if (!(operation.finished = (component != null))) {
component = parseTerm(symbols, start, i, operation); // May set 'operation.finished' flag.
}
if (operation.finished) {
finish(position); // For preventing interpretation of "degree minute" as "degree × minute".
}
unit = operation.apply(unit, component, start);
// If the text was a URI, the whole URI is consumed even if the symbol was only a part of it.
position.setIndex(endOfURI >= 0 ? endOfURI : i);
return unit;
}
/**
 * The operation to apply between the unit parsed so far and the next term parsed by
 * {@link UnitFormat#parseTerm(CharSequence, int, int, Operation)}. This object is mutable:
 * the {@link #code} is updated by the parser every time a new operator is found.
 */
private static final class Operation {
    /**
     * Meaning of some characters parsed by {@link UnitFormat#parse(CharSequence)}.
     * The {@code IMPLICIT} case is a multiplication without symbol, which can be
     * overridden by an explicit × or / symbol.
     */
    static final int NOOP     = 0,
                     IMPLICIT = 1,
                     MULTIPLY = 2,
                     DIVIDE   = 3,
                     EXPONENT = 4;

    /**
     * The operation as one of the {@link #NOOP}, {@link #IMPLICIT}, {@link #MULTIPLY},
     * {@link #DIVIDE} or {@link #EXPONENT} values.
     */
    int code;

    /**
     * The symbols being parsed. Used only for formatting error message if needed.
     */
    private final CharSequence symbols;

    /**
     * {@code true} if the parsed terms may be one or more words, possibly containing white spaces.
     * In such case, the parsing should not continue after those words.
     *
     * @see Position#finished
     */
    boolean finished;

    /**
     * Creates an operation initialized to {@link #NOOP}.
     *
     * @param  symbols  the symbols being parsed, for error messages only.
     */
    Operation(final CharSequence symbols) {
        this.symbols = symbols;
    }

    /**
     * Applies the current operation between the given units. With {@link #NOOP}, the term is returned
     * unchanged and the left operand is ignored. With {@link #EXPONENT}, the term must be a dimensionless
     * unit without symbol and with an integer scale factor, which is used as the power.
     *
     * @param  unit      the left operand, which is the unit parsed so far.
     * @param  term      the right operand, which is the newly parsed unit.
     * @param  position  the parse position to report if parsing fail.
     * @return the result of applying the operation.
     * @throws ParserException if the operation is an exponentiation and the term is not an integer.
     */
    Unit<?> apply(final Unit<?> unit, final Unit<?> term, final int position) {
        if (code == NOOP) {
            return term;
        }
        if (code == IMPLICIT || code == MULTIPLY) {
            return unit.multiply(term);
        }
        if (code == DIVIDE) {
            return unit.divide(term);
        }
        if (code != EXPONENT) {
            throw new AssertionError(code);
        }
        if (UnitDimension.isDimensionless(term.getDimension())) {
            final String symbol = term.getSymbol();
            if (symbol == null || symbol.isEmpty()) {
                final double scale = Units.toStandardUnit(term);
                final int power = (int) scale;
                if (power == scale) {               // Accept only scale factors that are integers.
                    return unit.pow(power);
                }
            }
        }
        throw new ParserException(Errors.format(Errors.Keys.NotAnInteger_1, term), symbols, position);
    }

    /**
     * If this operation is a multiplication (implicit or explicit), replaces it by a division.
     * Otherwise does nothing (we do <strong>not</strong> replace division by multiplication).
     * The intent is to replace units like "m⋅s-1" by "m/s".
     *
     * @return whether the operation has been inverted.
     */
    boolean invert() {
        if (code == IMPLICIT || code == MULTIPLY) {
            code = DIVIDE;
            return true;
        }
        return false;
    }
}
/**
* Parses a single unit symbol with its exponent.
* The given symbol shall not contain multiplication or division operator except in exponent.
* Parsing of fractional exponent as in "m2/3" is supported; other operations in the exponent
* will cause an exception to be thrown.
*
* @param symbols the complete string specified by the user.
* @param lower index where to begin parsing in the {@code symbols} string.
* @param upper index after the last character to parse in the {@code symbols} string.
* @param operation the operation to be applied (e.g. the term to be parsed is a multiplier or divisor of another unit).
* @return the parsed unit symbol (never {@code null}).
* @throws ParserException if a problem occurred while parsing the given symbols.
*/
@SuppressWarnings("fallthrough")
private Unit<?> parseTerm(final CharSequence symbols, final int lower, final int upper, final Operation operation)
throws ParserException
{
final String uom = CharSequences.trimWhitespaces(symbols, lower, upper).toString();
/*
* Check for labels explicitly given by users. Those labels have precedence over the Apache SIS hard-coded
* symbols. If no explicit label was found, check for symbols and names known to this UnitFormat instance.
* The 'operation.finished' flag is set when the term matched a user-specified label.
*/
Unit<?> unit = labelToUnit.get(uom);
operation.finished = (unit != null);
if (unit == null) {
unit = Prefixes.getUnit(uom);
if (unit == null) {
final int length = uom.length();
if (length == 0) {
// An empty term (e.g. nothing before a leading operator) is handled as the dimensionless unit.
return Units.UNITY;
} else {
/*
* If the first character is a digit or a sign, presume that the term is a multiplication factor.
* The "*" character is used for raising the number on the left to the power on the right.
* Example: "10*6" is equal to one million. SIS also handles the "^" character as "*".
*
* In principle, spaces are not allowed in unit symbols (in particular, UCUM specifies that
* spaces should not be interpreted as multiplication operators). However in practice we have
* sometimes units written in a form like "100 feet".
*
* If the last character is a super-script, then we assume a notation like "10⁻⁴".
*/
final char c = uom.charAt(0); // No need for code point because next condition is true only for BMP.
if (isDigit(c) || isSign(c)) {
final double multiplier;
try {
int s = uom.indexOf(' ');
if (s >= 0) {
final int next = CharSequences.skipLeadingWhitespaces(uom, s, length);
if (next < length && AbstractUnit.isSymbolChar(uom.codePointAt(next))) {
operation.finished = true; // For preventing attempt to continue parsing after "100 feet".
// The number before the space is the scale factor of the unit after the space.
multiplier = Double.parseDouble(uom.substring(0, s));
return parseTerm(uom, s, length, new Operation(uom)).multiply(multiplier);
}
}
multiplier = parseMultiplicationFactor(uom);
} catch (NumberFormatException e) {
throw (ParserException) new ParserException(Errors.format(
Errors.Keys.UnknownUnit_1, uom), symbols, lower).initCause(e);
}
// A number after an implicit multiplication, as in "(m²⋅s)-1", is handled as an exponent.
if (operation.code == Operation.IMPLICIT) {
operation.code = Operation.EXPONENT;
}
return Units.UNITY.multiply(multiplier);
}
}
if (length >= 2) {
/*
* If the symbol ends with a digit (normal script or superscript), presume that this is the unit
* exponent. That exponent can be a Unicode superscript character (only one character in current
* UnitFormat implementation) or a number parseable by the Fraction(String) constructor.
*/
Fraction power = null;
int i = length;
int c = uom.codePointBefore(i);
i -= Character.charCount(c);
if (Characters.isSuperScript(c)) {
c = Characters.toNormalScript(c);
if (isDigit(c)) {
power = new Fraction(c - '0', 1);
}
} else if (isDigit(c)) {
/*
* Walk backward over the digits and division symbols of the exponent. The loop stops at the
* first character which is neither; if that character is a sign, it is included in the exponent.
* If the beginning of the term is reached (term made only of exponent characters),
* then 'power' stay null.
*/
while (i != 0) {
c = uom.codePointBefore(i);
final boolean isExponent = isDigit(c) || isDivisor(c);
if (isExponent || isSign(c)) {
i -= Character.charCount(c);
}
if (!isExponent) {
try {
power = new Fraction(uom.substring(i));
} catch (NumberFormatException e) {
// Should never happen unless the number is larger than 'int' capacity.
throw (ParserException) new ParserException(Errors.format(
Errors.Keys.UnknownUnit_1, uom), symbols, lower+i).initCause(e);
}
break;
}
}
}
if (power != null) {
/*
* At this point we have parsed the exponent. Before to parse the raw unit symbol,
* skip the exponent symbol (^, * or **) if any.
*/
i = CharSequences.skipTrailingWhitespaces(uom, 0, i);
if (i != 0) {
// No need for code point because next conditions are true only in BMP.
switch (uom.charAt(i-1)) {
case Style.EXPONENT_OR_MULTIPLY: {
if (i != 1 && uom.charAt(i-2) == Style.EXPONENT_OR_MULTIPLY) i--;
// Fallthrough for skipping the next character and whitespaces.
}
case Style.EXPONENT: {
i = CharSequences.skipTrailingWhitespaces(uom, 0, i - 1);
break;
}
}
}
// The unit symbol is the part of the term before the exponent (and before the exponent symbol).
final String symbol = uom.substring(CharSequences.skipLeadingWhitespaces(uom, 0, i), i);
unit = labelToUnit.get(symbol);
operation.finished = (unit != null);
if (unit == null) {
unit = Prefixes.getUnit(symbol);
}
if (unit != null) {
int numerator = power.numerator;
int denominator = power.denominator;
// If possible, replace a multiplication by a negative power by a division by a positive power.
if (numerator < 0 && operation.invert()) {
numerator = -numerator;
}
if (numerator != 1) unit = unit.pow (numerator);
if (denominator != 1) unit = unit.root(denominator);
return unit;
}
}
}
/*
* At this point, we have determined that the label is not a known unit symbol.
* It may be a unit name, in which case the label is not case-sensitive anymore.
*/
operation.finished = true;
unit = fromName(uom);
if (unit == null) {
if (CharSequences.regionMatches(symbols, lower, UNITY, true)) {
return Units.UNITY;
}
throw new ParserException(Errors.format(Errors.Keys.UnknownUnit_1, uom), symbols, lower);
}
}
}
return unit;
}
/**
 * Parses a multiplication factor, which may be a single number or a base raised to an exponent.
 * For example all the following strings are equivalent: "1000", "1000.0", "1E3", "10*3", "10^3", "10³".
 * When an exponent is present (after an exponent symbol or as trailing superscript characters),
 * both the base and the exponent must be integers.
 *
 * @param  term  the text to parse. Shall not be empty.
 * @return the multiplication factor.
 * @throws NumberFormatException if the number, the base or the exponent can not be parsed.
 */
private static double parseMultiplicationFactor(final String term) throws NumberFormatException {
    int split = term.lastIndexOf(Style.EXPONENT_OR_MULTIPLY);      // Check standard UCUM symbol first.
    if (split < 0) {
        split = term.lastIndexOf(Style.EXPONENT);
    }
    final String exponent;
    if (split >= 0) {
        exponent = term.substring(split + 1);
    } else {
        split = term.length();
        int c = term.codePointBefore(split);
        if (!Characters.isSuperScript(c)) {
            return Double.parseDouble(term);        // No exponent symbol and no superscript found.
        }
        /*
         * Example: "10⁻⁴". Collect the trailing superscript characters in reverse order,
         * converted to normal script, until the base or the beginning of the term is reached.
         */
        final StringBuilder reversed = new StringBuilder(split);
        while (true) {
            reversed.appendCodePoint(Characters.toNormalScript(c));
            split -= Character.charCount(c);
            if (split <= 0) {
                break;
            }
            c = term.codePointBefore(split);
            if (!Characters.isSuperScript(c)) {
                break;
            }
        }
        exponent = reversed.reverse().toString();
    }
    final int base = Integer.parseInt(term.substring(0, split));
    final int exp  = Integer.parseInt(exponent);
    if (base == 10) {
        return MathFunctions.pow10(exp);
    }
    return Math.pow(base, exp);
}
/**
 * Parses text from a string to produce a unit. This implementation delegates to {@link #parse(CharSequence)}
 * and wraps the {@link ParserException} into a {@link ParseException} for compatibility with {@code java.text} API.
 * The message and the error position of the checked exception are copied from the original exception,
 * which is also set as the cause.
 *
 * @param  source  the text, part of which should be parsed.
 * @return a unit parsed from the string.
 * @throws ParseException if the given string can not be fully parsed.
 */
@Override
public Object parseObject(final String source) throws ParseException {
    try {
        return parse(source);
    } catch (ParserException e) {
        final ParseException wrapper = new ParseException(e.getLocalizedMessage(), e.getPosition());
        wrapper.initCause(e);
        throw wrapper;
    }
}
/**
 * Parses text from a string to produce a unit, or returns {@code null} if the parsing failed.
 * This implementation delegates to {@link #parse(CharSequence, ParsePosition)} and catches
 * the {@link ParserException}. In case of failure, the position reported by the exception
 * is stored as the {@linkplain ParsePosition#getErrorIndex() error index}.
 *
 * @param  source  the text, part of which should be parsed.
 * @param  pos     index and error index information as described above.
 * @return a unit parsed from the string, or {@code null} in case of error.
 */
@Override
public Object parseObject(final String source, final ParsePosition pos) {
    final Unit<?> unit;
    try {
        unit = parse(source, pos);
    } catch (ParserException e) {
        pos.setErrorIndex(e.getPosition());
        return null;
    }
    return unit;
}
/**
 * Returns a clone of this unit format. The new unit format will be initialized to the same
 * {@linkplain #getLocale() locale} and {@linkplain #label(Unit, String) labels} than this format.
 * The {@code unitToLabel} and {@code labelToUnit} maps are copied, then assigned to the final
 * fields of the clone by reflection.
 *
 * @return a clone of this unit format.
 */
@Override
public UnitFormat clone() {
    final UnitFormat copy = (UnitFormat) super.clone();
    final Object labels = clone(unitToLabel);
    final Object units  = clone(labelToUnit);
    try {
        // The two fields are final, so they can not be assigned by ordinary code after cloning.
        AccessController.doPrivileged(new FinalFieldSetter<>(UnitFormat.class, "unitToLabel", "labelToUnit"))
                .set(copy, labels, units);
    } catch (ReflectiveOperationException e) {
        throw FinalFieldSetter.cloneFailure(e);
    }
    return copy;
}
/**
 * Returns a copy of the given map as a {@link HashMap}. If the given map is a {@code HashMap},
 * then its {@linkplain HashMap#clone() clone} is returned. For any other instance, including the
 * one returned by {@link Collections#emptyMap()}, a new empty {@code HashMap} is returned.
 *
 * @param  value  the map to clone.
 * @return a clone of the given map, or a new empty map.
 */
private static Object clone(final Map<?,?> value) {
    if (!(value instanceof HashMap<?,?>)) {
        return new HashMap<>();
    }
    final HashMap<?,?> source = (HashMap<?,?>) value;
    return source.clone();
}
}