Skip to content

Commit

Permalink
Improve supplementary char support
Browse files Browse the repository at this point in the history
Precedence uses codepoint when probing lead char.

Scanner accepts supplementary chars in more places,
such as op_Supple, Supple"interp", s"$Supple".
  • Loading branch information
som-snytt committed Nov 8, 2021
1 parent af79b08 commit d59d371
Show file tree
Hide file tree
Showing 7 changed files with 168 additions and 108 deletions.
8 changes: 4 additions & 4 deletions spec/01-lexical-syntax.md
Expand Up @@ -506,7 +506,7 @@ interpolatedString ::= alphaid ‘"’ {[‘\’] interpolatedStringPart |
interpolatedStringPart ::= printableChar \ (‘"’ | ‘$’ | ‘\’) | escape
escape ::= ‘$$’
| ‘$"’
| ‘$’ id
| ‘$’ alphaid
| ‘$’ BlockExpr
alphaid ::= upper idrest
| varid
Expand All @@ -533,9 +533,9 @@ in an interpolated string. A single ‘$’-sign can still be obtained by doubli
character: ‘$$’. A single ‘"’-sign can be obtained by the sequence ‘\$"’.

The simpler form consists of a ‘$’-sign followed by an identifier starting with
a letter and followed only by letters, digits, and underscore characters,
e.g `$id`. The simpler form is expanded by putting braces around the identifier,
e.g `$id` is equivalent to `${id}`. In the following, unless we explicitly state otherwise,
a letter and followed only by letters, digits, and underscore characters, e.g., `$id`.
The simpler form is expanded by putting braces around the identifier,
e.g., `$id` is equivalent to `${id}`. In the following, unless we explicitly state otherwise,
we assume that this expansion has already been performed.

The expanded expression is type checked normally. Usually, `StringContext` will resolve to
Expand Down
107 changes: 51 additions & 56 deletions src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
Expand Up @@ -182,22 +182,26 @@ trait Scanners extends ScannersCommon {
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
isHighSurrogate(high) && {
var res = false
nextChar()
val low = ch
val low = lookaheadReader.getc()
if (isLowSurrogate(low)) {
nextChar()
val codepoint = toCodePoint(high, low)
if (isValidCodePoint(codepoint) && test(codepoint)) {
putChar(high)
putChar(low)
res = true
} else
syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
} else if (!strict) {
val codePoint = toCodePoint(high, low)
if (isValidCodePoint(codePoint)) {
if (test(codePoint)) {
putChar(high)
putChar(low)
nextChar()
nextChar()
res = true
}
}
else syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
}
else if (!strict) {
putChar(high)
nextChar()
res = true
} else
syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
}
else syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
res
}
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
Expand Down Expand Up @@ -621,8 +625,7 @@ trait Scanners extends ScannersCommon {
putChar(ch)
nextChar()
getIdentRest()
if (ch == '"' && token == IDENTIFIER)
token = INTERPOLATIONID
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
case '<' => // is XMLSTART?
def fetchLT() = {
val last = if (charOffset >= 2) buf(charOffset - 2) else ' '
Expand Down Expand Up @@ -729,12 +732,31 @@ trait Scanners extends ScannersCommon {
}
syntaxError(msg)
}
/** Completes a token that started with a single quote, after the caller has
 *  already buffered and consumed the character following that quote.
 *
 *  If the current character is the closing quote, it is consumed and the
 *  token becomes a character literal. Otherwise `op` is run to scan the rest
 *  of the name and the token becomes a (deprecated) symbol literal whose
 *  string value is the scanned name.
 *
 *  @param op scans the remainder of the name when this is not a char literal
 */
def charLitOrSymbolAfter(op: () => Unit): Unit = ch match {
  case '\'' =>
    // closing quote: step past it before recording the literal
    nextChar()
    token = CHARLIT
    setStrVal()
  case _ =>
    op()
    token = SYMBOLLIT
    strVal = name.toString
}
def fetchSingleQuote() = {
nextChar()
if (isIdentifierStart(ch))
charLitOr(() => getIdentRest())
else if (isOperatorPart(ch) && (ch != '\\'))
charLitOr(() => getOperatorRest())
if (isIdentifierStart(ch)) {
putChar(ch)
nextChar()
charLitOrSymbolAfter(() => getIdentRest())
}
else if (isOperatorPart(ch) && (ch != '\\')) {
putChar(ch)
nextChar()
charLitOrSymbolAfter(() => getOperatorRest())
}
else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
val isEmptyCharLit = (ch == '\'')
getLitChar()
Expand Down Expand Up @@ -801,12 +823,16 @@ trait Scanners extends ScannersCommon {
putChar(ch)
nextChar()
getIdentRest()
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
} else if (isSpecial(ch)) {
putChar(ch)
nextChar()
getOperatorRest()
} else if (isSupplementary(ch, isUnicodeIdentifierStart)) {
getIdentRest()
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
} else if (isSupplementary(ch, isSpecial)) {
getOperatorRest()
} else {
syntaxError(f"illegal character '\\u$ch%04x'")
nextChar()
Expand Down Expand Up @@ -872,7 +898,8 @@ trait Scanners extends ScannersCommon {
putChar(ch)
nextChar()
getIdentOrOperatorRest()
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
case ' ' | LF | // optimize for common whitespace
SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
finishNamed()
case _ =>
if (isUnicodeIdentifierPart(ch)) {
Expand All @@ -888,6 +915,7 @@ trait Scanners extends ScannersCommon {

@tailrec
private def getOperatorRest(): Unit = (ch: @switch) match {
case ' ' | LF => finishNamed() // optimize
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
Expand All @@ -899,24 +927,12 @@ trait Scanners extends ScannersCommon {
else { putChar('/'); getOperatorRest() }
case _ =>
if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
else if (isSupplementary(ch, isSpecial)) getOperatorRest()
else finishNamed()
}

/** Decides how to continue a name based on the current character:
 *  identifier parts continue as an identifier, operator characters continue
 *  as an operator, and anything else ends the name.
 */
private def getIdentOrOperatorRest(): Unit =
  if (isIdentifierPart(ch)) getIdentRest()
  // isOperatorPart(Char) accepts exactly the ASCII operator characters plus isSpecial
  else if (isOperatorPart(ch)) getOperatorRest()
  else finishNamed()

/** Continues a name as an identifier when the current character (or the
 *  supplementary character it begins) is an identifier part; otherwise
 *  hands over to the operator scanner.
 */
private def getIdentOrOperatorRest(): Unit = {
  // `||` short-circuits: the supplementary probe runs only when `ch` itself is not an identifier part
  val continuesIdentifier = isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)
  if (continuesIdentifier) getIdentRest()
  else getOperatorRest()
}

// Literals -----------------------------------------------------------------

Expand Down Expand Up @@ -1040,10 +1056,6 @@ trait Scanners extends ScannersCommon {
getInterpolatedIdentRest()
} else if (atSupplementary(ch, isUnicodeIdentifierStart)) {
finishStringPart()
putChar(ch)
nextRawChar()
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else {
val expectations = "$$, $\", $identifier or ${expression}"
Expand Down Expand Up @@ -1370,23 +1382,6 @@ trait Scanners extends ScannersCommon {
if (detectedFloat) restOfNonIntegralNumber() else restOfNumber()
}

/** Buffers and consumes the current character, then either finishes a
 *  character literal (when the next character is a closing single quote)
 *  or runs `op` and produces a symbol literal token named after the
 *  scanned text.
 *
 *  @param op scans the remainder of the name when this is not a char literal
 */
def charLitOr(op: () => Unit): Unit = {
  putChar(ch)
  nextChar()
  ch match {
    case '\'' =>
      // closing quote: step past it before recording the literal
      nextChar()
      token = CHARLIT
      setStrVal()
    case _ =>
      op()
      token = SYMBOLLIT
      strVal = name.toString
  }
}

// Errors -----------------------------------------------------------------

/** generate an error at the given offset */
Expand Down
45 changes: 33 additions & 12 deletions src/reflect/scala/reflect/internal/Chars.scala
Expand Up @@ -15,10 +15,10 @@ package reflect
package internal

import scala.annotation.switch
import java.lang.{ Character => JCharacter }

/** Contains constants and classifier methods for characters */
trait Chars {
import Chars.CodePoint
// Be very careful touching these.
// Apparently trivial changes to the way you write these constants
// will cause Scanners.scala to go from a nice efficient switch to
Expand Down Expand Up @@ -72,28 +72,47 @@ trait Chars {
'0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'

/** Can character start an alphanumeric Scala identifier? */
def isIdentifierStart(c: Char): Boolean =
(c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)
def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)
def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)

/** Can character form part of an alphanumeric Scala identifier? */
def isIdentifierPart(c: Char) =
(c == '$') || Character.isUnicodeIdentifierPart(c)
def isIdentifierPart(c: Char) = (c == '$') || Character.isUnicodeIdentifierPart(c)

def isIdentifierPart(c: CodePoint) = (c == '$') || Character.isUnicodeIdentifierPart(c)

/** Tests whether `c` is classified by `Character.getType` as a math symbol
 *  or as an "other symbol".
 */
def isSpecial(c: Char) = {
  val category = Character.getType(c)
  if (category == Character.MATH_SYMBOL.toInt) true
  else category == Character.OTHER_SYMBOL.toInt
}
/** Code-point overload: tests whether `codePoint` is classified by
 *  `Character.getType` as a math symbol or as an "other symbol".
 */
def isSpecial(codePoint: CodePoint) = {
  val category = Character.getType(codePoint)
  val isMathSymbol = category == Character.MATH_SYMBOL.toInt
  isMathSymbol || category == Character.OTHER_SYMBOL.toInt
}

// used for precedence
import Character.{LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER}
private final val otherLetters = Set[Char]('\u0024', '\u005F') // '$' and '_'
private final val letterGroups = {
import JCharacter._
Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
}
def isScalaLetter(ch: Char) = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
private final val letterGroups = Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
def isScalaLetter(ch: Char) = letterGroups(Character.getType(ch).toByte) || otherLetters(ch)
/** Code-point overload of the letter test used for precedence: true when
 *  `c` belongs to one of the letter categories listed below, or is `$` or `_`.
 */
def isScalaLetter(c: CodePoint): Boolean = {
  val inLetterCategory = Character.getType(c) match {
    case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
    case _ => false
  }
  inLetterCategory || c == '$' || c == '_'
}

/** Can character form part of a Scala operator name? */
def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
def isOperatorPart(c: Char): Boolean = (c: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
'|' | '/' | '\\' => true
case c => isSpecial(c)
}
def isOperatorPart(c: CodePoint): Boolean = (c: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
Expand All @@ -102,4 +121,6 @@ trait Chars {
}
}

object Chars extends Chars { }
/** Singleton instance of the `Chars` trait, so its classifiers can be imported directly. */
object Chars extends Chars {
/** Alias marking an `Int` that holds a Unicode code point rather than a UTF-16 `Char`. */
type CodePoint = Int
}
18 changes: 8 additions & 10 deletions src/reflect/scala/reflect/internal/Precedence.scala
Expand Up @@ -10,26 +10,24 @@
* additional information regarding copyright ownership.
*/

package scala
package reflect
package internal
package scala.reflect.internal

import scala.annotation.switch
import Chars._
import Chars.{isOperatorPart, isScalaLetter}

final class Precedence private (val level: Int) extends AnyVal with Ordered[Precedence] {
def compare(that: Precedence): Int = level compare that.level
def compare(that: Precedence): Int = level.compare(that.level)
override def toString = s"Precedence($level)"
}


object Precedence extends (Int => Precedence) {
type CodePoint = Int
private[this] val ErrorName = "<error>"
private def isAssignmentOp(name: String) = name match {
case "!=" | "<=" | ">=" | "" => false
case _ => name.last == '=' && name.head != '=' && isOperatorPart(name.head)
case _ => name.last == '=' && name.head != '=' && isOperatorPart(name.codePointAt(0))
}
private def firstChar(ch: Char): Precedence = apply((ch: @switch) match {
private def firstChar(c: CodePoint): Precedence = apply((c: @switch) match {
case '|' => 2
case '^' => 3
case '&' => 4
Expand All @@ -38,13 +36,13 @@ object Precedence extends (Int => Precedence) {
case ':' => 7
case '+' | '-' => 8
case '*' | '/' | '%' => 9
case _ => if (isScalaLetter(ch)) 1 else 10
case _ => if (isScalaLetter(c)) 1 else 10
})

def apply(level: Int): Precedence = new Precedence(level)
def apply(name: String): Precedence = name match {
case "" | ErrorName => this(-1)
case _ if isAssignmentOp(name) => this(0)
case _ => firstChar(name charAt 0)
case _ => firstChar(name.codePointAt(0))
}
}
44 changes: 43 additions & 1 deletion test/files/run/t1406.scala
Expand Up @@ -9,8 +9,27 @@ object Test extends DirectTest {
// U+10428 (surrogate pair \ud801\udc28) isLetter and isLowerCase
def U2 = "\ud801"
def U3 = "\udc28"
// symbol operator So with supplementary char
def U4 = "\ud834"
def U5 = "\udd97"
//\ud83c\udf00 // cyclone 1f300
def U6 = "\ud83c"
def U7 = "\udf00"
// rocket 1f680
def U8 = "\ud83d"
def U9 = "\ude80"
// quintessence 1f700
def UA = "\ud83d"
def UB = "\udf00"

//val a_𝓅 = 1
// 1d4c5 Mathematical Script Small P
// My Pomeranian, Padraig, was nicknamed Little P because of his little p.
def UC = "\ud835"
def UD = "\udcc5"

def code =
s"""class C {
s"""class Identifiers {
| def x = "$U0"
| def y = "$U1"
| def `$U0` = x
Expand All @@ -23,6 +42,29 @@ object Test extends DirectTest {
| def g(x: Any) = x match {
| case $U2$U3 @ _ => $U2$U3
| }
|}
|class Ops {
| def $U4$U5 = 42 // was error: illegal character
| def op_$U4$U5 = 42 // was error: illegal character
| def $U6$U7 = 42
| def op_$U6$U7 = 42
| def $U8$U9 = 42
| def op_$U8$U9 = 42
| def $UA$UB = 42
| def op_$UA$UB = 42
| def $UC$UD = 42
| def op_$UC$UD = 42
|}
|class Strings {
| implicit class Interps(sc: StringContext) {
| def $UC$UD(parts: Any*) = "done"
| }
| def $U4$U5 = 42
| def op_$U4$U5 = 42
| def interpolated = s"$$$$$U4$U5" // a lot of dollars for little sense
| def interpolated_op = s"$$$$$U4$U5" // a lot of dollars for little sense
| def e = "a $UC$UD b"
| def f = $UC$UD"one"
|}""".stripMargin

def show(): Unit = {
Expand Down

0 comments on commit d59d371

Please sign in to comment.