Skip to content

Commit

Permalink
Improve support for Unicode supplementary characters in identifiers a…
Browse files Browse the repository at this point in the history
…nd string interpolation (as in Scala 2) (#16278)

Fixes #16271
  • Loading branch information
SethTisue committed Dec 29, 2022
2 parents 6f5bb34 + 22f11cd commit 805dda8
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 64 deletions.
24 changes: 19 additions & 5 deletions compiler/src/dotty/tools/dotc/core/NameOps.scala
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,25 @@ object NameOps {
def isVarPattern: Boolean =
testSimple { n =>
n.length > 0 && {
def isLowerLetterSupplementary: Boolean =
import Character.{isHighSurrogate, isLowSurrogate, isLetter, isLowerCase, isValidCodePoint, toCodePoint}
isHighSurrogate(n(0)) && n.length > 1 && isLowSurrogate(n(1)) && {
val codepoint = toCodePoint(n(0), n(1))
isValidCodePoint(codepoint) && isLetter(codepoint) && isLowerCase(codepoint)
}
val first = n.head
(((first.isLower && first.isLetter) || first == '_')
&& (n != false_)
&& (n != true_)
&& (n != null_))
((first.isLower && first.isLetter || first == '_' || isLowerLetterSupplementary)
&& n != false_
&& n != true_
&& n != null_)
}
} || name.is(PatMatGivenVarName)

def isOpAssignmentName: Boolean = name match {
case raw.NE | raw.LE | raw.GE | EMPTY =>
false
case name: SimpleName =>
name.length > 0 && name.last == '=' && name.head != '=' && isOperatorPart(name.head)
name.length > 0 && name.last == '=' && name.head != '=' && isOperatorPart(name.firstCodePoint)
case _ =>
false
}
Expand Down Expand Up @@ -352,6 +358,14 @@ object NameOps {
val unmangled = kinds.foldLeft(name)(_.unmangle(_))
if (unmangled eq name) name else unmangled.unmangle(kinds)
}

/** The first Unicode code point of this name's first part.
 *
 *  When the first two chars are a high surrogate followed by a low surrogate,
 *  the pair is combined into a single supplementary code point; in every other
 *  case the value of the first char alone is returned.
 *
 *  NOTE(review): the first char is read unconditionally, so this assumes the
 *  first part is non-empty — confirm against callers.
 */
def firstCodePoint: Int =
  val part = name.firstPart
  val lead = part(0)
  // Deliberately a `def`: the second char may only be read after the length check.
  def trail = part(1)
  val paired =
    Character.isHighSurrogate(lead) && part.length > 1 && Character.isLowSurrogate(trail)
  if !paired then lead
  else
    val combined = Character.toCodePoint(lead, trail)
    if Character.isValidCodePoint(combined) then combined else lead
}

extension (name: TermName) {
Expand Down
2 changes: 1 addition & 1 deletion compiler/src/dotty/tools/dotc/core/Names.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ object Names {
*/
abstract class Designator

/** A name if either a term name or a type name. Term names can be simple
/** A name is either a term name or a type name. Term names can be simple
* or derived. A simple term name is essentially an interned string stored
* in a name table. A derived term name adds a tag, and possibly a number
* or a further simple name to some other name.
Expand Down
61 changes: 26 additions & 35 deletions compiler/src/dotty/tools/dotc/parsing/Scanners.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import config.Feature.{migrateTo3, fewerBracesEnabled}
import config.SourceVersion.`3.0`
import reporting.{NoProfile, Profile, Message}

import java.util.Objects

object Scanners {

/** Offset into source character array */
Expand Down Expand Up @@ -777,19 +779,21 @@ object Scanners {
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
isHighSurrogate(high) && {
var res = false
nextChar()
val low = ch
val low = lookaheadChar()
if isLowSurrogate(low) then
nextChar()
val codepoint = toCodePoint(high, low)
if isValidCodePoint(codepoint) && test(codepoint) then
putChar(high)
putChar(low)
res = true
if isValidCodePoint(codepoint) then
if test(codepoint) then
putChar(high)
putChar(low)
nextChar()
nextChar()
res = true
else
error(em"illegal character '${toUnicode(high)}${toUnicode(low)}'")
else if !strict then
putChar(high)
nextChar()
res = true
else
error(em"illegal character '${toUnicode(high)}' missing low surrogate")
Expand Down Expand Up @@ -889,7 +893,6 @@ object Scanners {
if (ch == '\"') {
if (lookaheadChar() == '\"') {
nextRawChar()
//offset += 3 // first part is positioned at the quote
nextRawChar()
stringPart(multiLine = true)
}
Expand All @@ -900,7 +903,6 @@ object Scanners {
}
}
else {
//offset += 1 // first part is positioned at the quote
stringPart(multiLine = false)
}
}
Expand Down Expand Up @@ -977,30 +979,29 @@ object Scanners {
}
case _ =>
def fetchOther() =
if (ch == '\u21D2') {
if ch == '\u21D2' then
nextChar(); token = ARROW
report.deprecationWarning(em"The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (ch == '\u2190') {
else if ch == '\u2190' then
nextChar(); token = LARROW
report.deprecationWarning(em"The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (Character.isUnicodeIdentifierStart(ch)) {
else if isUnicodeIdentifierStart(ch) then
putChar(ch)
nextChar()
getIdentRest()
}
else if (isSpecial(ch)) {
if ch == '"' && token == IDENTIFIER then token = INTERPOLATIONID
else if isSpecial(ch) then
putChar(ch)
nextChar()
getOperatorRest()
}
else if isSupplementary(ch, isUnicodeIdentifierStart) then
getIdentRest()
else {
if ch == '"' && token == IDENTIFIER then token = INTERPOLATIONID
else if isSupplementary(ch, isSpecial) then
getOperatorRest()
else
error(em"illegal character '${toUnicode(ch)}'")
nextChar()
}
fetchOther()
}
}
Expand Down Expand Up @@ -1115,7 +1116,7 @@ object Scanners {
else error(em"unclosed quoted identifier")
}

private def getIdentRest(): Unit = (ch: @switch) match {
@tailrec private def getIdentRest(): Unit = (ch: @switch) match {
case 'A' | 'B' | 'C' | 'D' | 'E' |
'F' | 'G' | 'H' | 'I' | 'J' |
'K' | 'L' | 'M' | 'N' | 'O' |
Expand Down Expand Up @@ -1150,7 +1151,7 @@ object Scanners {
finishNamed()
}

private def getOperatorRest(): Unit = (ch: @switch) match {
@tailrec private def getOperatorRest(): Unit = (ch: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
Expand All @@ -1161,23 +1162,13 @@ object Scanners {
if nxch == '/' || nxch == '*' then finishNamed()
else { putChar(ch); nextChar(); getOperatorRest() }
case _ =>
if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
if isSpecial(ch) then { putChar(ch); nextChar(); getOperatorRest() }
else if isSupplementary(ch, isSpecial) then getOperatorRest()
else finishNamed()
}

private def getIdentOrOperatorRest(): Unit =
if (isIdentifierPart(ch))
getIdentRest()
else ch match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
'|' | '\\' | '/' =>
getOperatorRest()
case _ =>
if (isSpecial(ch)) getOperatorRest()
else finishNamed()
}
if (isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)) getIdentRest() else getOperatorRest()

def isSoftModifier: Boolean =
token == IDENTIFIER
Expand Down Expand Up @@ -1500,7 +1491,7 @@ object Scanners {
if (ch == '\'') finishCharLit()
else {
token = op
strVal = if (name != null) name.toString else null
strVal = Objects.toString(name)
litBuf.clear()
}
}
Expand Down
2 changes: 1 addition & 1 deletion compiler/src/dotty/tools/dotc/parsing/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ package object parsing {
def precedence(operator: Name): Int =
if (operator eq nme.ERROR) -1
else {
val firstCh = operator.firstPart.head
val firstCh = operator.firstCodePoint
if (isScalaLetter(firstCh)) 1
else if (operator.isOpAssignmentName) 0
else firstCh match {
Expand Down
57 changes: 35 additions & 22 deletions compiler/src/dotty/tools/dotc/util/Chars.scala
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
package dotty.tools.dotc.util

import scala.annotation.switch
import java.lang.{Character => JCharacter}
import java.lang.Character.LETTER_NUMBER
import java.lang.Character.LOWERCASE_LETTER
import java.lang.Character.OTHER_LETTER
import java.lang.Character.TITLECASE_LETTER
import java.lang.Character.UPPERCASE_LETTER
import Character.{LETTER_NUMBER, LOWERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, UPPERCASE_LETTER}
import Character.{MATH_SYMBOL, OTHER_SYMBOL}
import Character.{isJavaIdentifierPart, isUnicodeIdentifierStart, isUnicodeIdentifierPart}

/** Contains constants and classifier methods for characters */
object Chars {
object Chars:

inline val LF = '\u000A'
inline val FF = '\u000C'
inline val CR = '\u000D'
inline val SU = '\u001A'

type CodePoint = Int

/** Convert a character digit to an Int according to given base,
* -1 if no success
*/
Expand Down Expand Up @@ -59,17 +58,21 @@ object Chars {
'0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'

/** Can character start an alphanumeric Scala identifier? */
def isIdentifierStart(c: Char): Boolean =
(c == '_') || (c == '$') || JCharacter.isUnicodeIdentifierStart(c)
def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)

/** Can character form part of an alphanumeric Scala identifier? */
def isIdentifierPart(c: Char): Boolean =
(c == '$') || JCharacter.isUnicodeIdentifierPart(c)
def isIdentifierPart(c: Char): Boolean = (c == '$') || isUnicodeIdentifierPart(c)
def isIdentifierPart(c: CodePoint) = (c == '$') || isUnicodeIdentifierPart(c)

/** Is character a math or other symbol in Unicode? */
def isSpecial(c: Char): Boolean = {
val chtp = JCharacter.getType(c)
chtp == JCharacter.MATH_SYMBOL.toInt || chtp == JCharacter.OTHER_SYMBOL.toInt
val chtp = Character.getType(c)
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
}
def isSpecial(codePoint: CodePoint) = {
val chtp = Character.getType(codePoint)
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
}

def isValidJVMChar(c: Char): Boolean =
Expand All @@ -78,15 +81,26 @@ object Chars {
def isValidJVMMethodChar(c: Char): Boolean =
!(c == '.' || c == ';' || c =='[' || c == '/' || c == '<' || c == '>')

private final val otherLetters = Set[Char]('\u0024', '\u005F') // '$' and '_'
private final val letterGroups = {
import JCharacter._
Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
}
def isScalaLetter(ch: Char): Boolean = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
def isScalaLetter(c: Char): Boolean =
Character.getType(c: @switch) match {
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
case _ => c == '$' || c == '_'
}
def isScalaLetter(c: CodePoint): Boolean =
Character.getType(c: @switch) match {
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
case _ => c == '$' || c == '_'
}

/** Can character form part of a Scala operator name? */
def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
def isOperatorPart(c: Char): Boolean = (c: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
'|' | '/' | '\\' => true
case c => isSpecial(c)
}
def isOperatorPart(c: CodePoint): Boolean = (c: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
Expand All @@ -95,5 +109,4 @@ object Chars {
}

/** Would the character be encoded by `NameTransformer.encode`? */
def willBeEncoded(c : Char) : Boolean = !JCharacter.isJavaIdentifierPart(c)
}
def willBeEncoded(c: Char): Boolean = !isJavaIdentifierPart(c)
4 changes: 4 additions & 0 deletions tests/pos/surrogates.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,8 @@ class Construction {
def reversed = "xyz\udc00\ud801abc"
}

// Uses an emoji, which lies outside the Basic Multilingual Plane and is
// therefore a surrogate pair in UTF-16, as the name of a member.
class Demon {
val 😈 = 42
}

// was: error: illegal character '\ud801', '\udc00'
32 changes: 32 additions & 0 deletions tests/pos/t1406.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

// Identifiers beginning with the supplementary character 𐐨, used in pattern position.
// NOTE(review): presumably 𐐨 is classified as a lowercase letter, so these
// patterns bind fresh variables rather than refer to existing values — confirm.
class Identifiers {

// A typed pattern whose name starts with 𐐨, followed by a bare 𐐨 pattern.
def f(x: Any): Boolean = x match {
case 𐐨XYZ: String => true
case 𐐨 => true
}
// Binds 𐐨 with `@` and then uses it as the result expression.
def g(x: Any) = x match {
case 𐐨 @ _ => 𐐨
}
}
// Method names made of a single supplementary character, each declared twice:
// once on its own and once following an `op_` prefix.
class Ops {
def 𝆗 = 42 // was error: illegal character
def op_𝆗 = 42 // was error: illegal character
def 🌀 = 42
def op_🌀 = 42
def 🚀 = 42
def op_🚀 = 42
def 🜀 = 42
def op_🜀 = 42
def 𝓅 = 42
def op_𝓅 = 42
}
// The supplementary identifier 𝓅 in string-related positions: as an
// interpolator name, as an interpolated variable, and inside a plain literal.
class Strings {
// Adds a `𝓅` interpolator to StringContext; it ignores its arguments.
implicit class Interps(sc: StringContext) {
def 𝓅(parts: Any*) = "done"
}
def 𝓅 = 42
// `$𝓅` must be scanned as a reference to the member `𝓅` above.
def interpolated = s"$𝓅"
// Inside an ordinary string literal the character is just text.
def e = "a 𝓅 b"
// `𝓅` immediately followed by a quote is used as an interpolator id.
def f = 𝓅"one"
}
28 changes: 28 additions & 0 deletions tests/run/t1406b.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@

/** An `Int` wrapper whose operator-named methods are used to probe how the
 *  parser assigns precedence to different kinds of operator characters.
 *
 *  Every method multiplies the wrapped values except `+`, which adds them, so
 *  the result of `a op b + b` reveals whether `op` bound tighter or looser
 *  than `+`.
 *
 *  @param n the wrapped value
 */
case class C(n: Int) {
  def 𐀀(c: C): C = C(n * c.n)    // actually a letter but supplementary 0x10000
  // The method name `☀` had been lost from this line, leaving `def (c: C)`,
  // which does not parse; it is restored to match `☀=` and the `c ☀ d` calls.
  def ☀(c: C): C = C(n * c.n)    // just a symbol
  def ☀=(c: C): C = C(n * c.n)   // just a symbol
  def 🌀(c: C): C = C(n * c.n)   // cyclone operator is symbol, supplementary
  def 🌀=(c: C): C = C(n * c.n)  // cyclone operator is symbol, supplementary
  def *(c: C): C = C(n * c.n)
  def +(c: C): C = C(n + c.n)
}
// Evaluates `c op d + d` with c = d = C(42) for several operators.
// A result of ProductSum (42 * 42 + 42) means `op` bound tighter than `+`;
// a result of SumProduct (42 * (42 + 42)) means `op` bound looser than `+`.
object Test extends App {
val Sum = 84
val Product = 1764
val ProductSum = 1806
val SumProduct = 3528
val c, d = C(42)
// Compares the expected Int with the value wrapped by the resulting C.
def assertEquals(expected: Int, actual: C) = assert(expected == actual.n)
assertEquals(Sum, c + d)
assertEquals(Product, c * d)
assertEquals(Product, c ☀ d)
assertEquals(ProductSum, c * d + d)
assertEquals(ProductSum, c ☀ d + d)
assertEquals(SumProduct, c ☀= d + d) // assignment op is low precedence
assertEquals(SumProduct, c 𐀀 d + d) // the first one, letter should be low precedence
assertEquals(ProductSum, c 🌀d + d) // the second one, cyclone should be high precedence
assertEquals(SumProduct, c 🌀= d + d) // assignment op is low precedence
}

0 comments on commit 805dda8

Please sign in to comment.