-
Notifications
You must be signed in to change notification settings - Fork 728
/
ghc_cmm.rb
336 lines (280 loc) · 8.76 KB
/
ghc_cmm.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# -*- coding: utf-8 -*- #
# frozen_string_literal: true
# C minus minus (Cmm) is a pun on the name C++. It's an intermediate language
# of the Glasgow Haskell Compiler (GHC) that is very similar to C, but with
# many features missing and some special constructs.
#
# Cmm is a dialect of C--. The goal of this lexer is to use what GHC produces
# and parses (Cmm); C-- itself is not supported.
#
# https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/cmm-syntax
#
module Rouge
module Lexers
class GHCCmm < RegexLexer
# Lexer metadata, declared through the lexer class DSL.
title('GHC Cmm (C--)')
desc('GHC Cmm is the intermediate representation of the GHC Haskell compiler')
tag('ghc-cmm')
aliases('cmm')
filenames('*.cmm', '*.dump-cmm', '*.dump-cmm-*')
# Regex fragments interpolated into the rule patterns of the states below.

# Preprocessor directives. Used by `id` so that an identifier can never start
# at a directive.
macros = %r(#(?:define|include|endif|else|if))

# One unit of "whitespace": a blank character, a `//` comment up to and
# including its newline, or a `/* ... */` comment.
ws = %r(\s|//.*?\n|/[*](?:[^*]|(?:[*][^/]))*[*]+/)mx

# Identifier: a run of word characters plus `#`, `$`, `%` and `'` that does not
# begin with a preprocessor directive.
id = %r((?!#{macros})[\w#\$%_']+)

# Identifier that may additionally contain the sequences `()`, `(,)` and `[]`,
# but must end in at least one ordinary identifier character.
#
# Digits are intentionally NOT listed as their own alternative: `\w` already
# matches them, and a second alternative for the same character gives the
# engine two ways to consume every digit. When the overall match fails (for
# example in a lookahead that requires a following `(` or `{`) that ambiguity
# makes backtracking exponential in the number of digits.
complex_id = %r(
  (?:[\w#$%_']|\(\)|\(,\)|\[\])*
  (?:[\w#$%_']+)
)mx
# Entry state of the lexer. Rules and mixed-in states are tried top to bottom,
# so the order below is significant.
state :root do
  rule %r/\s+/m, Text

  # Section markers: a whole line of the form `===== ... =====`
  rule %r/^=====.*=====$/, Generic::Heading

  # Timestamps: a whole line such as `2020-01-31 12:34:56.789 UTC`
  rule %r/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ UTC$/, Comment::Single

  # Mixed in one after another, in exactly this order.
  %i[
    detect_section
    preprocessor_macros
    info_tbls
    comments
    literals
    keywords
    types
    infos
    names
    operators
  ].each { |sub_state| mixin sub_state }

  # Escaped newline (backslash at the end of a line)
  rule %r/\\\n/, Comment::Preproc

  # Anything not recognised above is plain text, one character at a time
  rule %r/./, Text
end
# Recognises the word `section` followed by whitespace and hands over to the
# :section state for the rest of the section header.
state :detect_section do
  rule %r/(section)(\s+)/ do
    # Group 1 is the keyword, group 2 the whitespace after it.
    groups Keyword, Text
    push :section
  end
end
# Remainder of a section header, entered from :detect_section. The state is
# left again at the opening `{`.
state :section do
  # Quoted section kind, e.g. `"data"`
  rule %r/"(data|cstring|text|rodata|relrodata|bss)"/, Name::Builtin
  rule %r/{/, Punctuation, :pop!

  # Tried in this order, after the two rules above.
  %i[names operators keywords].each { |sub_state| mixin sub_state }

  rule %r/\s/, Text
end
# Preprocessor directives.
state :preprocessor_macros do
  # Directives highlighted on their own
  rule %r/#(include|endif|else|if)/, Comment::Preproc

  # `#define NAME`: the directive, optional whitespace/comments, and the name
  # being defined.
  rule %r{
    (\#define)
    (#{ws}*)
    (#{id})
  }mx do |define|
    token Comment::Preproc, define[1]
    # The gap may contain comments, so it is lexed again rather than emitted
    # as a single token.
    recurse define[2]
    token Name::Label, define[3]
  end
end
# Start of an info-table listing: the literal text `{ info_tbls:`.
state :info_tbls do
  rule %r/({ )(info_tbls)(:)/ do
    # `{ ` / `info_tbls` / `:`
    groups Punctuation, Name::Entity, Punctuation
    push :info_tbls_body
  end
end
# Contents of an info-table listing, entered from :info_tbls. Every `{` pushes
# this state again and every `}` pops it, so nested braces stay balanced.
state :info_tbls_body do
  rule %r/}/, Punctuation, :pop!
  rule %r/{/, Punctuation, :info_tbls_body

  # `label:` is only peeked at here; :label consumes it and the rest of the line.
  rule(%r/(?=label:)/) { push :label }

  # `(name,` at the start of a tuple-like entry
  rule %r{(\()(#{complex_id})(,)}mx do
    groups Punctuation, Name::Label, Punctuation
  end

  # Tried in this order.
  %i[literals infos keywords operators].each { |sub_state| mixin sub_state }

  # Remaining identifiers and whitespace are plain text in this state
  rule %r/#{id}/, Text
  rule %r/\s/, Text
end
# A `label:` entry inside an info table, pushed by :info_tbls_body. The state
# lasts until the end of the current line.
state :label do
  # Tried in this order.
  %i[infos names keywords operators].each { |sub_state| mixin sub_state }

  # Horizontal whitespace only (tab, space, ...); the newline is handled below
  rule %r/[^\S\n]/, Text
  # End of line: leave the state
  rule %r/\n/, Text, :pop!
end
# Comment forms. Mixed into :root and :function.
state :comments do
# `//` through the end of the line
rule %r/\/{2}.*/, Comment::Single
# Parenthesised text that starts with `likely`, up to the first `)`
rule %r/\(likely.*?\)/, Comment
# `/* ... */`; the `m` flag lets `.` match newlines, so the comment may span
# several lines
rule %r/\/\*.*?\*\//m, Comment::Multiline
end
# Numeric literals and the opening quote of a string. Mixed into :root and
# :info_tbls_body.
state :literals do
# Floats are tried before integers so that `1.5` is not split at the dot.
# Both forms accept an optional leading minus sign.
rule %r/-?[0-9]+\.[0-9]+/, Literal::Number::Float
rule %r/-?[0-9]+/, Literal::Number::Integer
# Opening quote; the string body is lexed by :literal_string
rule %r/"/, Literal::String::Delimiter, :literal_string
end
# Body of a double-quoted string, entered from :literals after the opening
# quote has been emitted.
state :literal_string do
# A backslash followed by any single character
rule %r/\\./, Literal::String::Escape
# A `%` followed by any single character
rule %r/%./, Literal::String::Symbol
# Closing quote: return to the state that opened the string
rule %r/"/, Literal::String::Delimiter, :pop!
# Any other character (`.` does not match a newline here)
rule %r/./, Literal::String
end
# Operators and punctuation.
state :operators do
# Two dots form one operator token; a single `.` falls through to the
# punctuation rule below
rule %r/\.\./, Operator
# Single-character operators
rule %r/[+\-*\/<>=!&|~]/, Operator
# Brackets, braces, parentheses and separators
rule %r/[\[\].{}:;,()]/, Punctuation
end
# Keywords and keyword-like phrases. Where a phrase contains whitespace or
# comments between its words, that gap is passed to `recurse` so it is lexed
# again instead of being emitted as part of the keyword.
state :keywords do
  # `const` is a keyword only when it stands alone. Without the lookahead this
  # rule would claim the first five characters of any identifier that merely
  # starts with `const` (for example `constr_tag`) and split it in two. The
  # same `(?!#{id})` guard is used for the global registers in :names.
  rule %r/const(?!#{id})/, Keyword::Constant

  rule %r/"/, Literal::String::Double

  # `switch <anything up to the brace> {`
  rule %r/(switch)([^{]*)({)/ do |m|
    token Keyword, m[1]
    recurse m[2]
    token Punctuation, m[3]
  end

  # `arg hints:` / `result hints:`
  rule %r/(arg|result)(#{ws}+)(hints)(:)/ do |m|
    token Name::Property, m[1]
    recurse m[2]
    token Name::Property, m[3]
    token Punctuation, m[4]
  end

  # `returns to`
  rule %r/(returns)(#{ws}*)(to)/ do |m|
    token Keyword, m[1]
    recurse m[2]
    token Keyword, m[3]
  end

  # `never returns`
  rule %r/(never)(#{ws}*)(returns)/ do |m|
    token Keyword, m[1]
    recurse m[2]
    token Keyword, m[3]
  end

  # `return (`
  rule %r{(return)(#{ws}*)(\()} do |m|
    token Keyword, m[1]
    recurse m[2]
    token Punctuation, m[3]
  end

  # Plain keywords; each must be followed by one unit of whitespace/comment
  rule %r{(if|else|goto|call|offset|import|jump|ccall|foreign|prim|case|unwind|export|reserve|push)(#{ws})} do |m|
    token Keyword, m[1]
    recurse m[2]
  end

  # `default:`
  rule %r{(default)(#{ws}*)(:)} do |m|
    token Keyword, m[1]
    recurse m[2]
    token Punctuation, m[3]
  end
end
# Type names. The last two rules are anchored to the start of a line.
state :types do
  # Memory access, `type[42]`: only the type name is consumed here; the
  # bracketed part is merely looked ahead at.
  rule %r/(#{id})(?=\[[^\]])/ do
    groups Keyword::Type
  end

  # Array type, `type[]`: name and brackets form one token.
  rule %r/(#{id}\[\])/ do
    groups Keyword::Type
  end

  # Macro substitution in front of a function, tried before typed declarations:
  # in `PREPROCESSOR_MACRO_VARIABLE someFun()` the first word is not a type.
  # Emits the macro name, the function name and the opening parenthesis.
  rule %r{
    (^#{id})
    (#{ws}+)
    (#{id})
    (#{ws}*)
    (\()
  }mx do |call|
    token Name::Label, call[1]
    recurse call[2]
    token Name::Function, call[3]
    recurse call[4]
    token Punctuation, call[5]
  end

  # Typed declaration at the start of a line:
  # `type /* optional whitespace */ var_name`
  # Emits the type and the declared name; whatever follows the name is left to
  # other rules.
  rule %r{
    (^#{id})
    (#{ws}+)
    (#{id})
  }mx do |decl|
    token Keyword::Type, decl[1]
    recurse decl[2]
    token Name::Label, decl[3]
  end
end
# Field names that are directly followed by a colon.
state :infos do
  # Property-like fields: name, then `:`
  rule %r/(args|res|upd|label|rep|srt|arity|fun_type|arg_space|updfr_space)(:)/ do
    groups Name::Property, Punctuation
  end

  # `stack_info:` is highlighted as an entity rather than a property
  rule %r/(stack_info)(:)/ do
    groups Name::Entity, Punctuation
  end
end
# Names: type annotations, registers, namespaces, functions and labels.
state :names do
  # `:: Type` where the type starts with an upper-case letter
  rule %r/(::)(#{ws}*)([A-Z]\w+)/ do |annotation|
    token Operator, annotation[1]
    recurse annotation[2]
    token Keyword::Type, annotation[3]
  end

  # `<name>` in angle brackets
  rule %r/<(#{id})>/, Name::Builtin

  # Fixed global names; the lookahead rejects identifiers that merely start
  # with one of them
  rule %r/(Sp|SpLim|Hp|HpLim|HpAlloc|BaseReg|CurrentNursery|CurrentTSO|R\d{1,2}|gcptr)(?!#{id})/, Name::Variable::Global

  rule %r/CLOSURE/, Keyword::Type

  # `Namespace.`: the remainder of the qualified name is lexed in
  # :namespace_name
  rule %r/([A-Z]#{id})(\.)/ do
    groups Name::Namespace, Punctuation
    push :namespace_name
  end

  # Inline function call in backticks, as in: arg1 `lt` arg2
  rule %r/(`)(#{id})(`)/ do
    groups Punctuation, Name::Function, Punctuation
  end

  # Function: `name /* optional whitespace */ (`
  # Function (arguments via explicit stack handling): `name /* optional whitespace */ {`
  # Only a lookahead: nothing is consumed here, :function lexes the name.
  rule(%r{(?=
    #{complex_id}
    #{ws}*
    [\{\(]
  )}mx) { push :function }

  # Any other identifier is a label
  rule %r/#{complex_id}/, Name::Label
end
# Remainder of a qualified name, entered from :names after the first
# `Namespace.` prefix has been emitted.
state :namespace_name do
  # Further `Namespace.` prefixes; stay in this state
  rule %r/([A-Z]#{id})(\.)/ do
    groups Name::Namespace, Punctuation
  end

  # Final component followed by `(` or `{`: a function name
  rule %r{(#{complex_id})(#{ws}*)([\{\(])}mx do |call|
    token Name::Function, call[1]
    recurse call[2]
    token Punctuation, call[3]
    pop!
  end

  # Final component otherwise: a label
  rule %r/#{complex_id}/, Name::Label, :pop!

  # Anything else: leave the state without consuming input
  rule(%r/(?=.)/m) { pop! }
end
# Lexes a function name up to its opening `(` or `{`. :names pushes this state
# from a zero-width lookahead, so the name itself is still unconsumed on entry.
state :function do
# Info-table macro names. The longer alternatives are listed before the bare
# `INFO_TABLE` so they are not cut short at that prefix.
rule %r/INFO_TABLE_FUN|INFO_TABLE_CONSTR|INFO_TABLE_SELECTOR|INFO_TABLE_RET|INFO_TABLE/, Name::Builtin
# A name prefixed with `%`
rule %r/%#{id}/, Name::Builtin
# Any other name is the function name
rule %r/#{complex_id}/, Name::Function
rule %r/\s+/, Text
# The opening parenthesis or brace ends the function name
rule %r/[({]/, Punctuation, :pop!
mixin :comments
end
end
end
end