sis-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From desruisse...@apache.org
Subject [sis] 01/02: Accept, under some conditions, spaces as multiplication symbols (e.g. "kg m**-2"). This is a departure from the UCUM specification, which does not allow spaces as multiplication symbols. We make this departure only when we think that it is non-ambiguous.
Date Tue, 06 Nov 2018 19:24:29 GMT
This is an automated email from the ASF dual-hosted git repository.

desruisseaux pushed a commit to branch geoapi-4.0
in repository https://gitbox.apache.org/repos/asf/sis.git

commit 34169b0ea1727b100b3cd986679700942089a1d4
Author: Martin Desruisseaux <martin.desruisseaux@geomatys.com>
AuthorDate: Tue Nov 6 12:38:09 2018 +0100

    Accept, under some conditions, spaces as multiplication symbols (e.g. "kg m**-2").
    This is a departure from the UCUM specification, which does not allow spaces as multiplication symbols.
    We make this departure only when we think that it is non-ambiguous.
---
 .../java/org/apache/sis/measure/UnitFormat.java    | 87 ++++++++++++++++++----
 .../org/apache/sis/measure/UnitFormatTest.java     | 17 +++++
 2 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/core/sis-utility/src/main/java/org/apache/sis/measure/UnitFormat.java b/core/sis-utility/src/main/java/org/apache/sis/measure/UnitFormat.java
index fb18ac4..c2857df 100644
--- a/core/sis-utility/src/main/java/org/apache/sis/measure/UnitFormat.java
+++ b/core/sis-utility/src/main/java/org/apache/sis/measure/UnitFormat.java
@@ -268,6 +268,7 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma
      * Units associated to a given label (in addition to the system-wide {@link UnitRegistry}).
      * This map is the converse of {@link #unitToLabel}. The {@link Unit} instances may differ
from the ones
      * specified by user since {@link AbstractUnit#symbol} may have been set to the label
specified by the user.
+     * The labels may contain some characters normally not allowed in unit symbols, like white spaces.
      *
      * @see #label(Unit, String)
      */
@@ -467,6 +468,7 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma
      * for allowing the singleton {@link #INSTANCE} to parse symbols in a multi-threads environment.</p>
      *
      * @param  uom  the unit symbol, without leading or trailing spaces.
+     * @return the unit for the given name, or {@code null} if unknown.
      */
     private Unit<?> fromName(String uom) {
         /*
@@ -983,6 +985,33 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma
     }
 
     /**
+     * Parse position when the text to be parsed is expected to contain nothing other than a unit symbol.
+     * This is used for recording whether another term (separated from the previous term by a space)
+     * is allowed or not.
+     */
+    private static final class Position extends ParsePosition {
+        /** {@code true} if we do not expect any more content after the last term parsed.
*/
+        boolean finished;
+
+        /** Creates a new position initialized to the beginning of the text to parse. */
+        Position() {
+            super(0);
+        }
+    }
+
+    /**
+     * Reports that the parsing is finished and no more content should be parsed.
+     * This method is invoked when the last parsed term is possibly one or more words instead of unit symbols.
+     * The intent is to avoid trying to parse "degree minute" as "degree × minute". By contrast, this method is
+     * not invoked if the string to parse is "m kg**-2" because it can be interpreted as "m × kg**-2".
+     */
+    private static void finish(final ParsePosition pos) {
+        if (pos instanceof Position) {
+            ((Position) pos).finished = true;
+        }
+    }
+
+    /**
      * Parses the given text as an instance of {@code Unit}.
      * If the parse completes without reading the entire length of the text, an exception
is thrown.
      *
@@ -991,9 +1020,18 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma
      * The product operator can be either {@code '.'} (ASCII) or {@code '⋅'} (Unicode)
character.
      * Exponent after symbol can be decimal digits as in “m2” or a superscript as in
“m²”.</p>
      *
+     * <p>This method differs from {@link #parse(CharSequence, ParsePosition)} in the treatment of white spaces:
+     * that method with a {@link ParsePosition} argument stops parsing at the first white space,
+     * while this {@code parse(…)} method treats white spaces as multiplications.
+     * The reason for this difference is that white space is normally not a valid multiplication symbol;
+     * it could be followed by a text which is not part of the unit symbol.
+     * But in the case of this {@code parse(CharSequence)} method, the whole {@code CharSequence} shall be a unit symbol.
+     * In such a case, white spaces are less ambiguous.</p>
+     *
      * <p>The default implementation delegates to
      * <code>{@linkplain #parse(CharSequence, ParsePosition) parse}(symbols, new ParsePosition(0))</code>
-     * and verifies that all non-white characters have been parsed.</p>
+     * and verifies that all non-white characters have been parsed.
+     * Units separated by spaces are multiplied; for example "kg m**-2" is parsed as kg/m².</p>
      *
      * @param  symbols  the unit symbols or URI to parse.
      * @return the unit parsed from the specified symbols.
@@ -1003,15 +1041,19 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma
      */
     @Override
     public Unit<?> parse(final CharSequence symbols) throws ParserException {
-        final ParsePosition position = new ParsePosition(0);
-        final Unit<?> unit = parse(symbols, position);
+        final Position position = new Position();
+        Unit<?> unit = parse(symbols, position);
         final int length = symbols.length();
-        final int unrecognized = CharSequences.skipLeadingWhitespaces(symbols, position.getIndex(),
length);
-        if (unrecognized < length) {
-            throw new ParserException(Errors.format(Errors.Keys.UnexpectedCharactersAfter_2,
-                    CharSequences.trimWhitespaces(symbols, 0, unrecognized),
-                    CharSequences.trimWhitespaces(symbols, unrecognized, length)),
-                    symbols, unrecognized);
+        int unrecognized;
+        while ((unrecognized = CharSequences.skipLeadingWhitespaces(symbols, position.getIndex(),
length)) < length) {
+            if (position.finished || !Character.isLetter(Character.codePointAt(symbols, unrecognized)))
{
+                throw new ParserException(Errors.format(Errors.Keys.UnexpectedCharactersAfter_2,
+                        CharSequences.trimWhitespaces(symbols, 0, unrecognized),
+                        CharSequences.trimWhitespaces(symbols, unrecognized, length)),
+                        symbols, unrecognized);
+            }
+            position.setIndex(unrecognized);
+            unit = unit.multiply(parse(symbols, position));
         }
         return unit;
     }
@@ -1062,6 +1104,7 @@ public class UnitFormat extends Format implements javax.measure.format.UnitForma
                     final Unit<?> unit = Units.valueOfEPSG(Integer.parseInt(code));
                     if (unit != null) {
                         position.setIndex(endOfURI);
+                        finish(position);
                         return unit;
                     }
                 } catch (NumberFormatException e) {
@@ -1260,8 +1303,11 @@ search:     while ((i = CharSequences.skipTrailingWhitespaces(symbols,
start, i)
                 i = end;                        // Restore the full length (until the first
illegal character).
             }
         }
-        if (component == null) {
-            component = parseTerm(symbols, start, i, operation);
+        if (!(operation.finished = (component != null))) {
+            component = parseTerm(symbols, start, i, operation);            // May set 'operation.finished'
flag.
+        }
+        if (operation.finished) {
+            finish(position);           // For preventing interpretation of "degree minute"
as "degree × minute".
         }
         unit = operation.apply(unit, component, start);
         position.setIndex(endOfURI >= 0 ? endOfURI : i);
@@ -1292,6 +1338,14 @@ search:     while ((i = CharSequences.skipTrailingWhitespaces(symbols,
start, i)
         private final CharSequence symbols;
 
         /**
+         * {@code true} if the parsed terms may be one or more words, possibly containing white spaces.
+         * In such a case, the parsing should not continue after those words.
+         *
+         * @see Position#finished
+         */
+        boolean finished;
+
+        /**
          * Creates an operation initialized to {@link #NOOP}.
          */
         Operation(final CharSequence symbols) {
@@ -1353,8 +1407,7 @@ search:     while ((i = CharSequences.skipTrailingWhitespaces(symbols,
start, i)
      * @param  symbols    the complete string specified by the user.
      * @param  lower      index where to begin parsing in the {@code symbols} string.
      * @param  upper      index after the last character to parse in the {@code symbols}
string.
-     * @param  operation  if the term will be used as multiplier or divisor of another unit,
the
-     *                    operation to be applied. Otherwise {@code null}.
+     * @param  operation  the operation to be applied (e.g. the term to be parsed is a multiplier or divisor of another unit).
      * @return the parsed unit symbol (never {@code null}).
      * @throws ParserException if a problem occurred while parsing the given symbols.
      */
@@ -1368,6 +1421,7 @@ search:     while ((i = CharSequences.skipTrailingWhitespaces(symbols,
start, i)
          * symbols. If no explicit label was found, check for symbols and names known to
this UnitFormat instance.
          */
         Unit<?> unit = labelToUnit.get(uom);
+        operation.finished = (unit != null);
         if (unit == null) {
             unit = Prefixes.getUnit(uom);
             if (unit == null) {
@@ -1394,8 +1448,9 @@ search:     while ((i = CharSequences.skipTrailingWhitespaces(symbols,
start, i)
                             if (s >= 0) {
                                 final int next = CharSequences.skipLeadingWhitespaces(uom,
s, length);
                                 if (next < length && AbstractUnit.isSymbolChar(uom.codePointAt(next)))
{
+                                    operation.finished = true;  // For preventing attempt
to continue parsing after "100 feet".
                                     multiplier = Double.parseDouble(uom.substring(0, s));
-                                    return parseTerm(uom, s, length, null).multiply(multiplier);
+                                    return parseTerm(uom, s, length, new Operation(uom)).multiply(multiplier);
                                 }
                             }
                             multiplier = parseMultiplicationFactor(uom);
@@ -1464,13 +1519,14 @@ search:     while ((i = CharSequences.skipTrailingWhitespaces(symbols,
start, i)
                         }
                         final String symbol = uom.substring(CharSequences.skipLeadingWhitespaces(uom,
0, i), i);
                         unit = labelToUnit.get(symbol);
+                        operation.finished = (unit != null);
                         if (unit == null) {
                             unit = Prefixes.getUnit(symbol);
                         }
                         if (unit != null) {
                             int numerator   = power.numerator;
                             int denominator = power.denominator;
-                            if (numerator < 0 && operation != null &&
operation.invert()) {
+                            if (numerator < 0 && operation.invert()) {
                                 numerator = -numerator;
                             }
                             if (numerator   != 1) unit = unit.pow (numerator);
@@ -1483,6 +1539,7 @@ search:     while ((i = CharSequences.skipTrailingWhitespaces(symbols,
start, i)
                  * At this point, we have determined that the label is not a known unit symbol.
                  * It may be a unit name, in which case the label is not case-sensitive anymore.
                  */
+                operation.finished = true;
                 unit = fromName(uom);
                 if (unit == null) {
                     if (CharSequences.regionMatches(symbols, lower, UNITY, true)) {
diff --git a/core/sis-utility/src/test/java/org/apache/sis/measure/UnitFormatTest.java b/core/sis-utility/src/test/java/org/apache/sis/measure/UnitFormatTest.java
index fc711b7..be982bf 100644
--- a/core/sis-utility/src/test/java/org/apache/sis/measure/UnitFormatTest.java
+++ b/core/sis-utility/src/test/java/org/apache/sis/measure/UnitFormatTest.java
@@ -482,6 +482,23 @@ public final strictfp class UnitFormatTest extends TestCase {
     }
 
     /**
+     * Tests parsing of symbols containing terms separated by spaces.
+     * This is valid only when using {@link UnitFormat#parse(CharSequence)}.
+     */
+    public void testParseTermsSeparatedBySpace() {
+        final UnitFormat f = new UnitFormat(Locale.UK);
+        assertSame(Units.METRES_PER_SECOND, f.parse("m s**-1"));
+        try {
+            f.parse("degree minute");
+            fail("Should not accept unknown sentence even if each individual word is known.");
+        } catch (ParserException e) {
+            final String message = e.getMessage();
+            assertTrue(message, message.contains("degree"));
+            assertTrue(message, message.contains("minute"));
+        }
+    }
+
+    /**
      * Tests parsing of symbols composed of terms combined by arithmetic operations (e.g.
"m/s").
      */
     @Test


Mime
View raw message