Improve support for Unicode supplementary characters in identifiers and string interpolation (as in Scala 2) (#16278)

Fixes #16271
scala · Dec 29, 2022 · 805dda8 · 805dda8
2 parents 6f5bb34 + 22f11cd
commit 805dda8
Show file tree

Hide file tree

Showing 8 changed files with 146 additions and 64 deletions.
diff --git a/compiler/src/dotty/tools/dotc/core/NameOps.scala b/compiler/src/dotty/tools/dotc/core/NameOps.scala
@@ -86,19 +86,25 @@ object NameOps {
     def isVarPattern: Boolean =
       testSimple { n =>
         n.length > 0 && {
+          def isLowerLetterSupplementary: Boolean =
+            import Character.{isHighSurrogate, isLowSurrogate, isLetter, isLowerCase, isValidCodePoint, toCodePoint}
+            isHighSurrogate(n(0)) && n.length > 1 && isLowSurrogate(n(1)) && {
+              val codepoint = toCodePoint(n(0), n(1))
+              isValidCodePoint(codepoint) && isLetter(codepoint) && isLowerCase(codepoint)
+            }
           val first = n.head
-          (((first.isLower && first.isLetter) || first == '_')
-            && (n != false_)
-            && (n != true_)
-            && (n != null_))
+          ((first.isLower && first.isLetter || first == '_' || isLowerLetterSupplementary)
+            && n != false_
+            && n != true_
+            && n != null_)
         }
       } || name.is(PatMatGivenVarName)
 
     def isOpAssignmentName: Boolean = name match {
       case raw.NE | raw.LE | raw.GE | EMPTY =>
         false
       case name: SimpleName =>
-        name.length > 0 && name.last == '=' && name.head != '=' && isOperatorPart(name.head)
+        name.length > 0 && name.last == '=' && name.head != '=' && isOperatorPart(name.firstCodePoint)
       case _ =>
         false
     }
@@ -352,6 +358,14 @@ object NameOps {
       val unmangled = kinds.foldLeft(name)(_.unmangle(_))
       if (unmangled eq name) name else unmangled.unmangle(kinds)
     }
+
+    def firstCodePoint: Int =
+      val first = name.firstPart
+      import Character.{isHighSurrogate, isLowSurrogate, isValidCodePoint, toCodePoint}
+      if isHighSurrogate(first(0)) && first.length > 1 && isLowSurrogate(first(1)) then
+        val codepoint = toCodePoint(first(0), first(1))
+        if isValidCodePoint(codepoint) then codepoint else first(0)
+      else first(0)
   }
 
   extension (name: TermName) {

diff --git a/compiler/src/dotty/tools/dotc/core/Names.scala b/compiler/src/dotty/tools/dotc/core/Names.scala
@@ -25,7 +25,7 @@ object Names {
    */
   abstract class Designator
 
-  /** A name if either a term name or a type name. Term names can be simple
+  /** A name is either a term name or a type name. Term names can be simple
    *  or derived. A simple term name is essentially an interned string stored
    *  in a name table. A derived term name adds a tag, and possibly a number
    *  or a further simple name to some other name.

diff --git a/compiler/src/dotty/tools/dotc/parsing/Scanners.scala b/compiler/src/dotty/tools/dotc/parsing/Scanners.scala
@@ -21,6 +21,8 @@ import config.Feature.{migrateTo3, fewerBracesEnabled}
 import config.SourceVersion.`3.0`
 import reporting.{NoProfile, Profile, Message}
 
+import java.util.Objects
+
 object Scanners {
 
   /** Offset into source character array */
@@ -777,19 +779,21 @@ object Scanners {
     private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
       isHighSurrogate(high) && {
         var res = false
-        nextChar()
-        val low = ch
+        val low = lookaheadChar()
         if isLowSurrogate(low) then
-          nextChar()
           val codepoint = toCodePoint(high, low)
-          if isValidCodePoint(codepoint) && test(codepoint) then
-            putChar(high)
-            putChar(low)
-            res = true
+          if isValidCodePoint(codepoint) then
+            if test(codepoint) then
+              putChar(high)
+              putChar(low)
+              nextChar()
+              nextChar()
+              res = true
           else
             error(em"illegal character '${toUnicode(high)}${toUnicode(low)}'")
         else if !strict then
           putChar(high)
+          nextChar()
           res = true
         else
           error(em"illegal character '${toUnicode(high)}' missing low surrogate")
@@ -889,7 +893,6 @@ object Scanners {
               if (ch == '\"') {
                 if (lookaheadChar() == '\"') {
                   nextRawChar()
-                  //offset += 3   // first part is positioned at the quote
                   nextRawChar()
                   stringPart(multiLine = true)
                 }
@@ -900,7 +903,6 @@ object Scanners {
                 }
               }
               else {
-                //offset += 1   // first part is positioned at the quote
                 stringPart(multiLine = false)
               }
             }
@@ -977,30 +979,29 @@ object Scanners {
           }
         case _ =>
           def fetchOther() =
-            if (ch == '\u21D2') {
+            if ch == '\u21D2' then
               nextChar(); token = ARROW
               report.deprecationWarning(em"The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
-            }
-            else if (ch == '\u2190') {
+            else if ch == '\u2190' then
               nextChar(); token = LARROW
               report.deprecationWarning(em"The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
-            }
-            else if (Character.isUnicodeIdentifierStart(ch)) {
+            else if isUnicodeIdentifierStart(ch) then
               putChar(ch)
               nextChar()
               getIdentRest()
-            }
-            else if (isSpecial(ch)) {
+              if ch == '"' && token == IDENTIFIER then token = INTERPOLATIONID
+            else if isSpecial(ch) then
               putChar(ch)
               nextChar()
               getOperatorRest()
-            }
             else if isSupplementary(ch, isUnicodeIdentifierStart) then
               getIdentRest()
-            else {
+              if ch == '"' && token == IDENTIFIER then token = INTERPOLATIONID
+            else if isSupplementary(ch, isSpecial) then
+              getOperatorRest()
+            else
               error(em"illegal character '${toUnicode(ch)}'")
               nextChar()
-            }
           fetchOther()
       }
     }
@@ -1115,7 +1116,7 @@ object Scanners {
       else error(em"unclosed quoted identifier")
     }
 
-    private def getIdentRest(): Unit = (ch: @switch) match {
+    @tailrec private def getIdentRest(): Unit = (ch: @switch) match {
       case 'A' | 'B' | 'C' | 'D' | 'E' |
            'F' | 'G' | 'H' | 'I' | 'J' |
            'K' | 'L' | 'M' | 'N' | 'O' |
@@ -1150,7 +1151,7 @@ object Scanners {
           finishNamed()
     }
 
-    private def getOperatorRest(): Unit = (ch: @switch) match {
+    @tailrec private def getOperatorRest(): Unit = (ch: @switch) match {
       case '~' | '!' | '@' | '#' | '%' |
            '^' | '*' | '+' | '-' | '<' |
            '>' | '?' | ':' | '=' | '&' |
@@ -1161,23 +1162,13 @@ object Scanners {
         if nxch == '/' || nxch == '*' then finishNamed()
         else { putChar(ch); nextChar(); getOperatorRest() }
       case _ =>
-        if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
+        if isSpecial(ch) then { putChar(ch); nextChar(); getOperatorRest() }
+        else if isSupplementary(ch, isSpecial) then getOperatorRest()
         else finishNamed()
     }
 
     private def getIdentOrOperatorRest(): Unit =
-      if (isIdentifierPart(ch))
-        getIdentRest()
-      else ch match {
-        case '~' | '!' | '@' | '#' | '%' |
-             '^' | '*' | '+' | '-' | '<' |
-             '>' | '?' | ':' | '=' | '&' |
-             '|' | '\\' | '/' =>
-          getOperatorRest()
-        case _ =>
-          if (isSpecial(ch)) getOperatorRest()
-          else finishNamed()
-      }
+      if (isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)) getIdentRest() else getOperatorRest()
 
     def isSoftModifier: Boolean =
       token == IDENTIFIER
@@ -1500,7 +1491,7 @@ object Scanners {
       if (ch == '\'') finishCharLit()
       else {
         token = op
-        strVal = if (name != null) name.toString else null
+        strVal = Objects.toString(name)
         litBuf.clear()
       }
     }

diff --git a/compiler/src/dotty/tools/dotc/parsing/package.scala b/compiler/src/dotty/tools/dotc/parsing/package.scala
@@ -17,7 +17,7 @@ package object parsing {
   def precedence(operator: Name): Int =
     if (operator eq nme.ERROR) -1
     else {
-      val firstCh = operator.firstPart.head
+      val firstCh = operator.firstCodePoint
       if (isScalaLetter(firstCh)) 1
       else if (operator.isOpAssignmentName) 0
       else firstCh match {

diff --git a/compiler/src/dotty/tools/dotc/util/Chars.scala b/compiler/src/dotty/tools/dotc/util/Chars.scala
@@ -1,21 +1,20 @@
 package dotty.tools.dotc.util
 
 import scala.annotation.switch
-import java.lang.{Character => JCharacter}
-import java.lang.Character.LETTER_NUMBER
-import java.lang.Character.LOWERCASE_LETTER
-import java.lang.Character.OTHER_LETTER
-import java.lang.Character.TITLECASE_LETTER
-import java.lang.Character.UPPERCASE_LETTER
+import Character.{LETTER_NUMBER, LOWERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, UPPERCASE_LETTER}
+import Character.{MATH_SYMBOL, OTHER_SYMBOL}
+import Character.{isJavaIdentifierPart, isUnicodeIdentifierStart, isUnicodeIdentifierPart}
 
 /** Contains constants and classifier methods for characters */
-object Chars {
+object Chars:
 
   inline val LF = '\u000A'
   inline val FF = '\u000C'
   inline val CR = '\u000D'
   inline val SU = '\u001A'
 
+  type CodePoint = Int
+
   /** Convert a character digit to an Int according to given base,
     *  -1 if no success
     */
@@ -59,17 +58,21 @@ object Chars {
     '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
 
   /** Can character start an alphanumeric Scala identifier? */
-  def isIdentifierStart(c: Char): Boolean =
-    (c == '_') || (c == '$') || JCharacter.isUnicodeIdentifierStart(c)
+  def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
+  def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
 
   /** Can character form part of an alphanumeric Scala identifier? */
-  def isIdentifierPart(c: Char): Boolean =
-    (c == '$') || JCharacter.isUnicodeIdentifierPart(c)
+  def isIdentifierPart(c: Char): Boolean = (c == '$') || isUnicodeIdentifierPart(c)
+  def isIdentifierPart(c: CodePoint) = (c == '$') || isUnicodeIdentifierPart(c)
 
   /** Is character a math or other symbol in Unicode?  */
   def isSpecial(c: Char): Boolean = {
-    val chtp = JCharacter.getType(c)
-    chtp == JCharacter.MATH_SYMBOL.toInt || chtp == JCharacter.OTHER_SYMBOL.toInt
+    val chtp = Character.getType(c)
+    chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
+  }
+  def isSpecial(codePoint: CodePoint) = {
+    val chtp = Character.getType(codePoint)
+    chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
   }
 
   def isValidJVMChar(c: Char): Boolean =
@@ -78,15 +81,26 @@ object Chars {
   def isValidJVMMethodChar(c: Char): Boolean =
     !(c == '.' || c == ';' || c =='[' || c == '/' || c == '<' || c == '>')
 
-  private final val otherLetters = Set[Char]('\u0024', '\u005F')  // '$' and '_'
-  private final val letterGroups = {
-    import JCharacter._
-    Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
-  }
-  def isScalaLetter(ch: Char): Boolean = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
+  def isScalaLetter(c: Char): Boolean =
+    Character.getType(c: @switch) match {
+      case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
+      case _ => c == '$' || c == '_'
+    }
+  def isScalaLetter(c: CodePoint): Boolean =
+    Character.getType(c: @switch) match {
+      case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
+      case _ => c == '$' || c == '_'
+    }
 
   /** Can character form part of a Scala operator name? */
-  def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
+  def isOperatorPart(c: Char): Boolean = (c: @switch) match {
+    case '~' | '!' | '@' | '#' | '%' |
+         '^' | '*' | '+' | '-' | '<' |
+         '>' | '?' | ':' | '=' | '&' |
+         '|' | '/' | '\\' => true
+    case c => isSpecial(c)
+  }
+  def isOperatorPart(c: CodePoint): Boolean = (c: @switch) match {
     case '~' | '!' | '@' | '#' | '%' |
          '^' | '*' | '+' | '-' | '<' |
          '>' | '?' | ':' | '=' | '&' |
@@ -95,5 +109,4 @@ object Chars {
   }
 
   /** Would the character be encoded by `NameTransformer.encode`? */
-  def willBeEncoded(c : Char) : Boolean = !JCharacter.isJavaIdentifierPart(c)
-}
+  def willBeEncoded(c: Char): Boolean = !isJavaIdentifierPart(c)
diff --git a/tests/pos/surrogates.scala b/tests/pos/surrogates.scala
@@ -25,4 +25,8 @@ class Construction {
   def reversed = "xyz\udc00\ud801abc"
 }
 
+class Demon {
+  val 😈 = 42
+}
+
 // was: error: illegal character '\ud801', '\udc00'
diff --git a/tests/pos/t1406.scala b/tests/pos/t1406.scala
@@ -0,0 +1,32 @@
+
+class Identifiers {
+
+  def f(x: Any): Boolean = x match {
+    case 𐐨XYZ: String => true
+    case 𐐨                => true
+  }
+  def g(x: Any) = x match {
+    case 𐐨 @ _ => 𐐨
+  }
+}
+class Ops {
+  def 𝆗 = 42        // was error: illegal character
+  def op_𝆗 = 42     // was error: illegal character
+  def 🌀 = 42
+  def op_🌀 = 42
+  def 🚀 = 42
+  def op_🚀 = 42
+  def 🜀 = 42
+  def op_🜀 = 42
+  def 𝓅 = 42
+  def op_𝓅 = 42
+}
+class Strings {
+  implicit class Interps(sc: StringContext) {
+    def 𝓅(parts: Any*) = "done"
+  }
+  def 𝓅 = 42
+  def interpolated = s"$𝓅"
+  def e = "a 𝓅 b"
+  def f = 𝓅"one"
+}
diff --git a/tests/run/t1406b.scala b/tests/run/t1406b.scala
@@ -0,0 +1,28 @@
+
+case class C(n: Int) {
+  def 𐀀(c: C): C = C(n * c.n)   // actually a letter but supplementary 0x10000
+  def ☀(c: C): C = C(n * c.n)   // just a symbol
+  def ☀=(c: C): C = C(n * c.n)   // just a symbol
+  def 🌀(c: C): C = C(n * c.n)  // cyclone operator is symbol, supplementary
+  def 🌀=(c: C): C = C(n * c.n)  // cyclone operator is symbol, supplementary
+  def *(c: C): C = C(n * c.n)
+  def +(c: C): C = C(n + c.n)
+}
+object Test extends App {
+  val Sum = 84
+  val Product = 1764
+  val ProductSum = 1806
+  val SumProduct = 3528
+  val c, d = C(42)
+  def assertEquals(expected: Int, actual: C) = assert(expected == actual.n)
+  assertEquals(Sum, c + d)
+  assertEquals(Product, c * d)
+  assertEquals(Product, c ☀ d)
+  assertEquals(ProductSum, c * d + d)
+  assertEquals(ProductSum, c ☀ d + d)
+  assertEquals(SumProduct, c ☀= d + d)           // assignment op is low precedence
+  assertEquals(SumProduct, c 𐀀 d + d)            // the first one, letter should be low precedence
+  assertEquals(ProductSum, c 🌀d + d)            // the second one, cyclone should be high precedence
+  assertEquals(SumProduct, c 🌀= d + d)          // assignment op is low precedence
+}
+