Fix raw token lexer w.r.t. Unicode.
fixes #1616
birkenfeld committed Dec 24, 2020
1 parent 6b615e6 commit f65ac3f
Showing 2 changed files with 19 additions and 18 deletions.
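
For context (not part of the commit itself): RawTokenLexer consumes the stream that pygments.formatters.RawTokenFormatter emits, one "Token.Type<TAB>repr(value)" pair per line. A minimal sketch of the fixed str-based path, with invented token values:

# A rough sketch (example values assumed, not taken from the commit):
# RawTokenLexer parses one "Token.Type<TAB>repr(value)" pair per line.
from pygments.lexers.special import RawTokenLexer

raw = ("Token.Keyword\t'def'\n"
       "Token.Text\t' '\n"
       "Token.Name.Function\t'gr\\u00fcssen'\n")  # escaped non-ASCII

for ttype, value in RawTokenLexer().get_tokens(raw):
    print(ttype, repr(value))
# The last value decodes via 'unicode-escape' to 'grüssen'; before this
# commit the stream was first encoded to ASCII (crashing on literal
# non-ASCII input) and values were sliced as val[2:-2], which mangles
# Python 3 str reprs that have no u'...' prefix.
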
CHANGES (1 change: 1 addition & 0 deletions)
@@ -20,6 +20,7 @@ Version 2.7.4
 - Fixed backtracking string regexes in JavaScript/TypeScript lexers (#1637)
 - Limit recursion with nesting Ruby heredocs (#1638)
 - Fixed a few inefficient regexes for guessing lexers
+- Fixed the raw token lexer handling of Unicode (#1616)
 
 
 Version 2.7.3
pygments/lexers/special.py (36 changes: 18 additions & 18 deletions)
@@ -39,7 +39,7 @@ def analyse_text(text):
 
 _ttype_cache = {}
 
-line_re = re.compile(b'.*?\n')
+line_re = re.compile('.*?\n')
 
 
 class RawTokenLexer(Lexer):
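
A side note on this hunk (illustration only, not from the commit): the pattern must change along with the data type, because Python's re module refuses to mix str and bytes. The str pattern works on the decoded stream:

import re

line_re = re.compile('.*?\n')
for m in line_re.finditer("Token.Keyword\t'def'\nToken.Text\t' '\n"):
    print(repr(m.group()))
# The old bytes pattern on the same str input would raise
# TypeError: cannot use a bytes pattern on a string-like object.
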
@@ -65,30 +65,30 @@ def __init__(self, **options):
         Lexer.__init__(self, **options)
 
     def get_tokens(self, text):
-        if isinstance(text, str):
-            # raw token stream never has any non-ASCII characters
-            text = text.encode('ascii')
-        if self.compress == 'gz':
-            import gzip
-            gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
-            text = gzipfile.read()
-        elif self.compress == 'bz2':
-            import bz2
-            text = bz2.decompress(text)
-
-        # do not call Lexer.get_tokens() because we do not want Unicode
-        # decoding to occur, and stripping is not optional.
-        text = text.strip(b'\n') + b'\n'
+        if self.compress:
+            if isinstance(text, str):
+                text = text.encode('latin1')
+            if self.compress == 'gz':
+                import gzip
+                gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
+                text = gzipfile.read()
+            elif self.compress == 'bz2':
+                import bz2
+                text = bz2.decompress(text)
+            text = text.decode('latin1')
+
+        # do not call Lexer.get_tokens() because stripping is not optional.
+        text = text.strip('\n') + '\n'
         for i, t, v in self.get_tokens_unprocessed(text):
             yield t, v
 
     def get_tokens_unprocessed(self, text):
         length = 0
         for match in line_re.finditer(text):
             try:
-                ttypestr, val = match.group().split(b'\t', 1)
+                ttypestr, val = match.group().rstrip().split('\t', 1)
             except ValueError:
-                val = match.group().decode('ascii', 'replace')
+                val = match.group()
                 ttype = Error
             else:
                 ttype = _ttype_cache.get(ttypestr)
@@ -100,6 +100,6 @@ def get_tokens_unprocessed(self, text):
                             raise ValueError('malformed token name')
                         ttype = getattr(ttype, ttype_)
                     _ttype_cache[ttypestr] = ttype
-                val = val[2:-2].decode('unicode-escape')
+                val = val[1:-1].encode().decode('unicode-escape')
             yield length, ttype, val
             length += len(val)
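
A note on the design choice above: latin1 maps byte values 0-255 one-to-one onto the first 256 code points, so encoding and decoding with it is lossless for arbitrary binary data; that is what lets compressed input pass through the str-based API. A rough sketch under that assumption (the payload and variable names are illustrative, not from the commit):

import gzip

from pygments.lexers.special import RawTokenLexer

# Compress a tiny raw token stream, then hand it to the lexer as str.
payload = gzip.compress(b"Token.Comment\t'# hi'\n")
lexer = RawTokenLexer(compress='gz')
# get_tokens() re-encodes the str with latin1, decompresses the bytes,
# and decodes the result with latin1 again, so no byte value can raise.
tokens = list(lexer.get_tokens(payload.decode('latin1')))
print(tokens)
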
