Improve supplementary char support

Precedence now uses the code point, rather than the first UTF-16 char, when probing the lead character of a name. The scanner accepts supplementary characters in more places, such as `op_Supple`, `Supple"interp"`, and `s"$Supple"`.
scala · Nov 8, 2021 · d59d371 · d59d371
1 parent af79b08
commit d59d371
Show file tree

Hide file tree

Showing 7 changed files with 168 additions and 108 deletions.
diff --git a/spec/01-lexical-syntax.md b/spec/01-lexical-syntax.md
@@ -506,7 +506,7 @@ interpolatedString     ::= alphaid ‘"’ {[‘\’] interpolatedStringPart |
 interpolatedStringPart ::= printableChar \ (‘"’ | ‘$’ | ‘\’) | escape
 escape                 ::= ‘$$’
                          | ‘$"’
-                         | ‘$’ id
+                         | ‘$’ alphaid
                          | ‘$’ BlockExpr
 alphaid                ::= upper idrest
                          |  varid
@@ -533,9 +533,9 @@ in an interpolated string. A single ‘$’-sign can still be obtained by doubli
 character: ‘$$’. A single ‘"’-sign can be obtained by the sequence ‘\$"’.
 
 The simpler form consists of a ‘$’-sign followed by an identifier starting with 
-a letter and followed only by letters, digits, and underscore characters, 
-e.g `$id`. The simpler form is expanded by putting braces around the identifier, 
-e.g `$id` is equivalent to `${id}`. In the following, unless we explicitly state otherwise, 
+a letter and followed only by letters, digits, and underscore characters, e.g., `$id`.
+The simpler form is expanded by putting braces around the identifier,
+e.g., `$id` is equivalent to `${id}`. In the following, unless we explicitly state otherwise,
 we assume that this expansion has already been performed.
 
 The expanded expression is type checked normally. Usually, `StringContext` will resolve to 

diff --git a/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala b/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
@@ -182,22 +182,26 @@ trait Scanners extends ScannersCommon {
     private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
       isHighSurrogate(high) && {
         var res = false
-        nextChar()
-        val low = ch
+        val low = lookaheadReader.getc()
         if (isLowSurrogate(low)) {
-          nextChar()
-          val codepoint = toCodePoint(high, low)
-          if (isValidCodePoint(codepoint) && test(codepoint)) {
-            putChar(high)
-            putChar(low)
-            res = true
-          } else
-            syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
-        } else if (!strict) {
+          val codePoint = toCodePoint(high, low)
+          if (isValidCodePoint(codePoint)) {
+            if (test(codePoint)) {
+              putChar(high)
+              putChar(low)
+              nextChar()
+              nextChar()
+              res = true
+            }
+          }
+          else syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
+        }
+        else if (!strict) {
           putChar(high)
+          nextChar()
           res = true
-        } else
-          syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
+        }
+        else syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
         res
       }
     private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
@@ -621,8 +625,7 @@ trait Scanners extends ScannersCommon {
           putChar(ch)
           nextChar()
           getIdentRest()
-          if (ch == '"' && token == IDENTIFIER)
-            token = INTERPOLATIONID
+          if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
         case '<' => // is XMLSTART?
           def fetchLT() = {
             val last = if (charOffset >= 2) buf(charOffset - 2) else ' '
@@ -729,12 +732,31 @@ trait Scanners extends ScannersCommon {
             }
             syntaxError(msg)
           }
+          /** If at the closing quote, finish a character literal;
+           *  otherwise run `op` and take the result as a (deprecated) symbol literal.
+           */
+          def charLitOrSymbolAfter(op: () => Unit): Unit =
+            if (ch == '\'') {
+              nextChar()
+              token = CHARLIT
+              setStrVal()
+            } else {
+              op()
+              token = SYMBOLLIT
+              strVal = name.toString
+            }
           def fetchSingleQuote() = {
             nextChar()
-            if (isIdentifierStart(ch))
-              charLitOr(() => getIdentRest())
-            else if (isOperatorPart(ch) && (ch != '\\'))
-              charLitOr(() => getOperatorRest())
+            if (isIdentifierStart(ch)) {
+              putChar(ch)
+              nextChar()
+              charLitOrSymbolAfter(() => getIdentRest())
+            }
+            else if (isOperatorPart(ch) && (ch != '\\')) {
+              putChar(ch)
+              nextChar()
+              charLitOrSymbolAfter(() => getOperatorRest())
+            }
             else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
               val isEmptyCharLit = (ch == '\'')
               getLitChar()
@@ -801,12 +823,16 @@ trait Scanners extends ScannersCommon {
               putChar(ch)
               nextChar()
               getIdentRest()
+              if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
             } else if (isSpecial(ch)) {
               putChar(ch)
               nextChar()
               getOperatorRest()
             } else if (isSupplementary(ch, isUnicodeIdentifierStart)) {
               getIdentRest()
+              if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
+            } else if (isSupplementary(ch, isSpecial)) {
+              getOperatorRest()
             } else {
               syntaxError(f"illegal character '\\u$ch%04x'")
               nextChar()
@@ -872,7 +898,8 @@ trait Scanners extends ScannersCommon {
         putChar(ch)
         nextChar()
         getIdentOrOperatorRest()
-      case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
+      case ' ' | LF |   // optimize for common whitespace
+           SU =>        // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
         finishNamed()
       case _ =>
         if (isUnicodeIdentifierPart(ch)) {
@@ -888,6 +915,7 @@ trait Scanners extends ScannersCommon {
 
     @tailrec
     private def getOperatorRest(): Unit = (ch: @switch) match {
+      case ' ' | LF   => finishNamed()          // optimize
       case '~' | '!' | '@' | '#' | '%' |
            '^' | '*' | '+' | '-' | '<' |
            '>' | '?' | ':' | '=' | '&' |
@@ -899,24 +927,12 @@ trait Scanners extends ScannersCommon {
         else { putChar('/'); getOperatorRest() }
       case _ =>
         if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
+        else if (isSupplementary(ch, isSpecial)) getOperatorRest()
         else finishNamed()
     }
 
-    private def getIdentOrOperatorRest(): Unit = {
-      if (isIdentifierPart(ch))
-        getIdentRest()
-      else ch match {
-        case '~' | '!' | '@' | '#' | '%' |
-             '^' | '*' | '+' | '-' | '<' |
-             '>' | '?' | ':' | '=' | '&' |
-             '|' | '\\' | '/' =>
-          getOperatorRest()
-        case _ =>
-          if (isSpecial(ch)) getOperatorRest()
-          else finishNamed()
-      }
-    }
-
+    private def getIdentOrOperatorRest(): Unit =
+      if (isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)) getIdentRest() else getOperatorRest()
 
 // Literals -----------------------------------------------------------------
 
@@ -1040,10 +1056,6 @@ trait Scanners extends ScannersCommon {
           getInterpolatedIdentRest()
         } else if (atSupplementary(ch, isUnicodeIdentifierStart)) {
           finishStringPart()
-          putChar(ch)
-          nextRawChar()
-          putChar(ch)
-          nextRawChar()
           getInterpolatedIdentRest()
         } else {
           val expectations = "$$, $\", $identifier or ${expression}"
@@ -1370,23 +1382,6 @@ trait Scanners extends ScannersCommon {
       if (detectedFloat) restOfNonIntegralNumber() else restOfNumber()
     }
 
-    /** Parse character literal if current character is followed by \',
-     *  or follow with given op and return a symbol literal token
-     */
-    def charLitOr(op: () => Unit): Unit = {
-      putChar(ch)
-      nextChar()
-      if (ch == '\'') {
-        nextChar()
-        token = CHARLIT
-        setStrVal()
-      } else {
-        op()
-        token = SYMBOLLIT
-        strVal = name.toString
-      }
-    }
-
 // Errors -----------------------------------------------------------------
 
     /** generate an error at the given offset */

diff --git a/src/reflect/scala/reflect/internal/Chars.scala b/src/reflect/scala/reflect/internal/Chars.scala
@@ -15,10 +15,10 @@ package reflect
 package internal
 
 import scala.annotation.switch
-import java.lang.{ Character => JCharacter }
 
 /** Contains constants and classifier methods for characters */
 trait Chars {
+  import Chars.CodePoint
   // Be very careful touching these.
   // Apparently trivial changes to the way you write these constants
   // will cause Scanners.scala to go from a nice efficient switch to
@@ -72,28 +72,47 @@ trait Chars {
     '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
 
   /** Can character start an alphanumeric Scala identifier? */
-  def isIdentifierStart(c: Char): Boolean =
-    (c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)
+  def isIdentifierStart(c: Char): Boolean      = (c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)
+  def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)
 
   /** Can character form part of an alphanumeric Scala identifier? */
-  def isIdentifierPart(c: Char) =
-    (c == '$') || Character.isUnicodeIdentifierPart(c)
+  def isIdentifierPart(c: Char)      = (c == '$') || Character.isUnicodeIdentifierPart(c)
+
+  def isIdentifierPart(c: CodePoint) = (c == '$') || Character.isUnicodeIdentifierPart(c)
 
   /** Is character a math or other symbol in Unicode?  */
   def isSpecial(c: Char) = {
     val chtp = Character.getType(c)
     chtp == Character.MATH_SYMBOL.toInt || chtp == Character.OTHER_SYMBOL.toInt
   }
+  def isSpecial(codePoint: CodePoint) = {
+    val chtp = Character.getType(codePoint)
+    chtp == Character.MATH_SYMBOL.toInt || chtp == Character.OTHER_SYMBOL.toInt
+  }
 
+  // used for precedence
+  import Character.{LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER}
   private final val otherLetters = Set[Char]('\u0024', '\u005F')  // '$' and '_'
-  private final val letterGroups = {
-    import JCharacter._
-    Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
-  }
-  def isScalaLetter(ch: Char) = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
+  private final val letterGroups = Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
+  def isScalaLetter(ch: Char) = letterGroups(Character.getType(ch).toByte) || otherLetters(ch)
+  def isScalaLetter(c: CodePoint): Boolean =
+    (Character.getType(c) match {
+      case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
+      case _ => false
+    }) || (c match {
+      case '$' | '_' => true
+      case _ => false
+    })
 
   /** Can character form part of a Scala operator name? */
-  def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
+  def isOperatorPart(c: Char): Boolean = (c: @switch) match {
+    case '~' | '!' | '@' | '#' | '%' |
+         '^' | '*' | '+' | '-' | '<' |
+         '>' | '?' | ':' | '=' | '&' |
+         '|' | '/' | '\\' => true
+    case c => isSpecial(c)
+  }
+  def isOperatorPart(c: CodePoint): Boolean = (c: @switch) match {
     case '~' | '!' | '@' | '#' | '%' |
          '^' | '*' | '+' | '-' | '<' |
          '>' | '?' | ':' | '=' | '&' |
@@ -102,4 +121,6 @@ trait Chars {
   }
 }
 
-object Chars extends Chars { }
+object Chars extends Chars {
+  type CodePoint = Int
+}
diff --git a/src/reflect/scala/reflect/internal/Precedence.scala b/src/reflect/scala/reflect/internal/Precedence.scala
@@ -10,26 +10,24 @@
  * additional information regarding copyright ownership.
  */
 
-package scala
-package reflect
-package internal
+package scala.reflect.internal
 
 import scala.annotation.switch
-import Chars._
+import Chars.{isOperatorPart, isScalaLetter}
 
 final class Precedence private (val level: Int) extends AnyVal with Ordered[Precedence] {
-  def compare(that: Precedence): Int = level compare that.level
+  def compare(that: Precedence): Int = level.compare(that.level)
   override def toString = s"Precedence($level)"
 }
 
-
 object Precedence extends (Int => Precedence) {
+  type CodePoint = Int
   private[this] val ErrorName = "<error>"
   private def isAssignmentOp(name: String) = name match {
     case "!=" | "<=" | ">=" | "" => false
-    case _                       => name.last == '=' && name.head != '=' && isOperatorPart(name.head)
+    case _                       => name.last == '=' && name.head != '=' && isOperatorPart(name.codePointAt(0))
   }
-  private def firstChar(ch: Char): Precedence = apply((ch: @switch) match {
+  private def firstChar(c: CodePoint): Precedence = apply((c: @switch) match {
     case '|'             => 2
     case '^'             => 3
     case '&'             => 4
@@ -38,13 +36,13 @@ object Precedence extends (Int => Precedence) {
     case ':'             => 7
     case '+' | '-'       => 8
     case '*' | '/' | '%' => 9
-    case _               => if (isScalaLetter(ch)) 1 else 10
+    case _               => if (isScalaLetter(c)) 1 else 10
   })
 
   def apply(level: Int): Precedence = new Precedence(level)
   def apply(name: String): Precedence = name match {
     case "" | ErrorName            => this(-1)
     case _ if isAssignmentOp(name) => this(0)
-    case _                         => firstChar(name charAt 0)
+    case _                         => firstChar(name.codePointAt(0))
   }
 }
diff --git a/test/files/run/t1406.scala b/test/files/run/t1406.scala
@@ -9,8 +9,27 @@ object Test extends DirectTest {
   // \u10428 isLetter and isLowerCase
   def U2 = "\ud801"
   def U3 = "\udc28"
+  // symbol operator So with supplementary char
+  def U4 = "\ud834"
+  def U5 = "\udd97"
+  //\ud83c\udf00   // cyclone 1f300
+  def U6 = "\ud83c"
+  def U7 = "\udf00"
+  // rocket 1f680
+  def U8 = "\ud83d"
+  def U9 = "\ude80"
+  // quintessence 1f700
+  def UA = "\ud83d"
+  def UB = "\udf00"
+
+  //val a_𝓅 = 1
+  // 1d4c5 Mathematical Script Small P
+  // My Pomeranian, Padraig, was nicknamed Little P because of his little p.
+  def UC = "\ud835"
+  def UD = "\udcc5"
+
   def code =
-    s"""class C {
+    s"""class Identifiers {
        |  def x = "$U0"
        |  def y = "$U1"
        |  def `$U0` = x
@@ -23,6 +42,29 @@ object Test extends DirectTest {
        |  def g(x: Any) = x match {
        |    case $U2$U3 @ _ => $U2$U3
        |  }
+       |}
+       |class Ops {
+       |  def $U4$U5 = 42        // was error: illegal character
+       |  def op_$U4$U5 = 42     // was error: illegal character
+       |  def $U6$U7 = 42
+       |  def op_$U6$U7 = 42
+       |  def $U8$U9 = 42
+       |  def op_$U8$U9 = 42
+       |  def $UA$UB = 42
+       |  def op_$UA$UB = 42
+       |  def $UC$UD = 42
+       |  def op_$UC$UD = 42
+       |}
+       |class Strings {
+       |  implicit class Interps(sc: StringContext) {
+       |    def $UC$UD(parts: Any*) = "done"
+       |  }
+       |  def $U4$U5 = 42
+       |  def op_$U4$U5 = 42
+       |  def interpolated = s"$$$$$U4$U5"  // a lot of dollars for little sense
+       |  def interpolated_op = s"$$$$$U4$U5"  // a lot of dollars for little sense
+       |  def e = "a $UC$UD b"
+       |  def f = $UC$UD"one"
        |}""".stripMargin
 
   def show(): Unit = {