forked from hail-is/hail
-
Notifications
You must be signed in to change notification settings - Fork 0
/
context.py
461 lines (373 loc) · 14.4 KB
/
context.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
import sys
import os
from urllib.parse import urlparse, urlunparse
import pkg_resources
from pyspark import SparkContext
import hail
from hail.genetics.reference_genome import ReferenceGenome
from hail.typecheck import nullable, typecheck, typecheck_method, enumeration, dictof
from hail.utils import get_env_or_default
from hail.utils.java import Env, FatalError, warning
from hail.backend import Backend
def _get_tmpdir(tmpdir):
if tmpdir is None:
tmpdir = '/tmp'
return tmpdir
def _get_local_tmpdir(local_tmpdir):
    """Resolve the local temporary directory to a URL with the ``file`` scheme.

    The value is first resolved through ``get_env_or_default`` using the
    ``TMPDIR`` environment variable name and ``'file:///tmp'`` as the final
    default. A value without a scheme gets the ``file`` scheme attached.

    Parameters
    ----------
    local_tmpdir : :class:`str`, optional
        Requested local temporary directory.

    Returns
    -------
    :class:`str`
        The directory as a ``file`` URL.

    Raises
    ------
    ValueError
        If the resolved value carries a scheme other than ``file``.
    """
    local_tmpdir = get_env_or_default(local_tmpdir, 'TMPDIR', 'file:///tmp')
    r = urlparse(local_tmpdir)
    if not r.scheme:
        r = r._replace(scheme='file')
    elif r.scheme != 'file':
        # The message must be an f-string so the offending scheme is reported
        # instead of the literal text "{r.scheme}".
        raise ValueError(f'invalid local_tmpfile: must use scheme file, got scheme {r.scheme}')
    return urlunparse(r)
def _get_log(log):
    """Return the log path to use for this session.

    An explicit `log` is returned as-is. Otherwise a path is generated with
    ``hail.utils.timestamp_path`` from the prefix ``<cwd>/hail`` and a
    ``-<version>.log`` suffix.
    """
    if log is not None:
        return log
    hail_version = version()
    log_prefix = os.path.join(os.getcwd(), 'hail')
    return hail.utils.timestamp_path(log_prefix, suffix=f'-{hail_version}.log')
class HailContext:
    """State of an initialized Hail session.

    Constructing an instance registers it as ``Env._hc``, loads the built-in
    reference genomes from the backend, resolves the default reference genome
    and sets the global seed.
    """

    @typecheck_method(log=str,
                      quiet=bool,
                      append=bool,
                      tmpdir=str,
                      local_tmpdir=str,
                      default_reference=str,
                      global_seed=nullable(int),
                      backend=Backend)
    def __init__(self, log, quiet, append, tmpdir, local_tmpdir,
                 default_reference, global_seed, backend):
        # Only one context may be registered at a time.
        assert not Env._hc

        super().__init__()

        self._log = log
        self._tmpdir = tmpdir
        self._local_tmpdir = local_tmpdir
        self._backend = backend
        self._warn_cols_order = True
        self._warn_entries_order = True

        # Register this context before the reference genomes are loaded.
        Env._hc = self

        for builtin_name in ('GRCh37', 'GRCh38', 'GRCm38', 'CanFam3'):
            ReferenceGenome._from_config(self._backend.get_reference(builtin_name), True)

        # A name that is not an already-loaded reference is handed to
        # ReferenceGenome.read instead.
        self._default_ref = (
            ReferenceGenome._references[default_reference]
            if default_reference in ReferenceGenome._references
            else ReferenceGenome.read(default_reference)
        )

        if not quiet:
            py_version = version()
            banner = (
                'Welcome to\n'
                ' __ __ <>__\n'
                ' / /_/ /__ __/ /\n'
                ' / __ / _ `/ / /\n'
                ' /_/ /_/\\_,_/_/_/ version {}\n'
            ).format(py_version)
            sys.stderr.write(banner)

            if py_version.startswith('devel'):
                sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n'
                                 ' during the beta period. We recommend pulling\n'
                                 ' the latest changes weekly.\n')
            sys.stderr.write(f'LOGGING: writing to {log}\n')

        # Fall back to the fixed default seed when none was supplied.
        if global_seed is None:
            global_seed = 6348563392232659379
        Env.set_seed(global_seed)

    @property
    def default_reference(self):
        """The default reference genome resolved when the context was created."""
        return self._default_ref

    def stop(self):
        """Stop the backend and reset the session state held on ``Env``."""
        self._backend.stop()
        self._backend = None

        # Clear the globals so that a new context can be created afterwards.
        Env._hc = None
        Env._dummy_table = None
        Env._seed_generator = None

        hail.ir.clear_session_functions()
        ReferenceGenome._references = {}
@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38', 'GRCm38', 'CanFam3'),
           idempotent=bool,
           global_seed=nullable(int),
           spark_conf=nullable(dictof(str, str)),
           skip_logging_configuration=bool,
           local_tmpdir=nullable(str),
           _optimizer_iterations=nullable(int))
def init(sc=None, app_name='Hail', master=None, local='local[*]',
         log=None, quiet=False, append=False,
         min_block_size=0, branching_factor=50, tmp_dir='/tmp',
         default_reference='GRCh37', idempotent=False,
         global_seed=6348563392232659379,
         spark_conf=None,
         skip_logging_configuration=False,
         local_tmpdir=None,
         _optimizer_iterations=None):
    """Initialize Hail and Spark.

    Examples
    --------
    Import and initialize Hail using GRCh38 as the default reference genome:

    >>> import hail as hl
    >>> hl.init(default_reference='GRCh38')  # doctest: +SKIP

    Notes
    -----
    Hail is not only a Python library; most of Hail is written in Java/Scala
    and runs together with Apache Spark in the Java Virtual Machine (JVM).
    In order to use Hail, a JVM needs to run as well. The :func:`.init`
    function is used to initialize Hail and Spark.

    This function also sets global configuration parameters used for the Hail
    session, like the default reference genome and log file location.

    This function will be called automatically (with default parameters) if
    any Hail functionality requiring the backend (most of the library!) is used.
    To initialize Hail explicitly with non-default arguments, be sure to do so
    directly after importing the module, as in the above example.

    Note
    ----
    If a :class:`pyspark.SparkContext` is already running, then Hail must be
    initialized with it as an argument:

    >>> hl.init(sc=sc)  # doctest: +SKIP

    See Also
    --------
    :func:`.stop`

    Parameters
    ----------
    sc : pyspark.SparkContext, optional
        Spark context. By default, a Spark context will be created.
    app_name : :class:`str`
        Spark application name.
    master : :class:`str`, optional
        URL identifying the Spark leader (master) node or `local[N]` for local clusters.
    local : :class:`str`
        Local-mode core limit indicator. Must either be `local[N]` where N is a
        positive integer or `local[*]`. The latter indicates Spark should use all
        cores available. `local[*]` does not respect most containerization CPU
        limits. This option is only used if `master` is unset and `spark.master`
        is not set in the Spark configuration.
    log : :class:`str`, optional
        Local path for Hail log file. Does not currently support distributed
        file systems like Google Storage, S3, or HDFS. If not given, a
        timestamped file in the current working directory is used.
    quiet : :obj:`bool`
        Print fewer log messages.
    append : :obj:`bool`
        Append to the end of the log file.
    min_block_size : :obj:`int`
        Minimum file block size in MB.
    branching_factor : :obj:`int`
        Branching factor for tree aggregation.
    tmp_dir : :class:`str`, optional
        Networked temporary directory.  Must be a network-visible file
        path.  Defaults to /tmp in the default scheme.
    default_reference : :class:`str`
        Default reference genome. Either ``'GRCh37'``, ``'GRCh38'``,
        ``'GRCm38'``, or ``'CanFam3'``.
    idempotent : :obj:`bool`
        If ``True``, calling this function is a no-op if Hail has already been initialized.
    global_seed : :obj:`int`, optional
        Global random seed.
    spark_conf : :obj:`dict` of :class:`str` to :class:`str`, optional
        Spark configuration parameters.
    skip_logging_configuration : :obj:`bool`
        Skip logging configuration in java and python.
    local_tmpdir : :class:`str`, optional
        Local temporary directory.  Used on driver and executor nodes.
        Must use the file scheme.  Defaults to TMPDIR, or /tmp.
    """
    from hail.backend.spark_backend import SparkBackend

    if Env._hc:
        if idempotent:
            return
        else:
            # NOTE(review): only a warning is issued here; execution continues
            # and HailContext.__init__ asserts that no context is registered.
            warning('Hail has already been initialized. If this call was intended to change configuration,'
                    ' close the session with hl.stop() first.')

    log = _get_log(log)
    tmpdir = _get_tmpdir(tmp_dir)
    local_tmpdir = _get_local_tmpdir(local_tmpdir)
    optimizer_iterations = get_env_or_default(_optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3)

    backend = SparkBackend(
        idempotent, sc, spark_conf, app_name, master, local, log,
        quiet, append, min_block_size, branching_factor, tmpdir, local_tmpdir,
        skip_logging_configuration, optimizer_iterations)

    # Pass the normalized tmpdir (the same value the backend received), as
    # init_service and init_local do, rather than the raw tmp_dir argument.
    HailContext(
        log, quiet, append, tmpdir, local_tmpdir, default_reference,
        global_seed, backend)
@typecheck(
    billing_project=nullable(str),
    bucket=nullable(str),
    log=nullable(str),
    quiet=bool,
    append=bool,
    tmpdir=nullable(str),
    local_tmpdir=nullable(str),
    default_reference=enumeration('GRCh37', 'GRCh38', 'GRCm38', 'CanFam3'),
    global_seed=nullable(int),
    skip_logging_configuration=bool)
def init_service(
        billing_project: str = None,
        bucket: str = None,
        log=None,
        quiet=False,
        append=False,
        tmpdir=None,
        local_tmpdir=None,
        default_reference='GRCh37',
        global_seed=6348563392232659379,
        skip_logging_configuration=False):
    """Initialize Hail with a ``ServiceBackend``.

    `billing_project`, `bucket` and `skip_logging_configuration` are forwarded
    to the ``ServiceBackend`` constructor. The remaining arguments are
    normalized (log path, temporary directories) and used to create the
    :class:`.HailContext`.
    """
    from hail.backend.service_backend import ServiceBackend

    # The backend is created before any of the paths are resolved.
    service_backend = ServiceBackend(
        billing_project, bucket,
        skip_logging_configuration=skip_logging_configuration)

    HailContext(
        log=_get_log(log),
        quiet=quiet,
        append=append,
        tmpdir=_get_tmpdir(tmpdir),
        local_tmpdir=_get_local_tmpdir(local_tmpdir),
        default_reference=default_reference,
        global_seed=global_seed,
        backend=service_backend)
@typecheck(
    log=nullable(str),
    quiet=bool,
    append=bool,
    branching_factor=int,
    tmpdir=nullable(str),
    default_reference=enumeration('GRCh37', 'GRCh38', 'GRCm38', 'CanFam3'),
    global_seed=nullable(int),
    skip_logging_configuration=bool,
    _optimizer_iterations=nullable(int))
def init_local(
        log=None,
        quiet=False,
        append=False,
        branching_factor=50,
        tmpdir=None,
        default_reference='GRCh37',
        global_seed=6348563392232659379,
        skip_logging_configuration=False,
        _optimizer_iterations=None):
    """Initialize Hail with a ``LocalBackend``.

    The log path and temporary directory are normalized first. The resolved
    `tmpdir` is used both as the context's ``tmpdir`` and as its
    ``local_tmpdir``.
    """
    from hail.backend.local_backend import LocalBackend

    log = _get_log(log)
    tmpdir = _get_tmpdir(tmpdir)
    n_optimizer_iterations = get_env_or_default(
        _optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3)

    local_backend = LocalBackend(
        tmpdir, log, quiet, append, branching_factor,
        skip_logging_configuration, n_optimizer_iterations)

    HailContext(
        log=log,
        quiet=quiet,
        append=append,
        tmpdir=tmpdir,
        local_tmpdir=tmpdir,
        default_reference=default_reference,
        global_seed=global_seed,
        backend=local_backend)
def version():
    """Get the installed hail version.

    The value is read from the packaged ``hail_version`` resource the first
    time it is needed and stored on ``hail.__version__`` afterwards.

    Returns
    -------
    str
    """
    if hail.__version__ is not None:
        return hail.__version__
    # https://stackoverflow.com/questions/6028000/how-to-read-a-static-file-from-inside-a-python-package
    raw_version = pkg_resources.resource_string(__name__, 'hail_version')
    hail.__version__ = raw_version.decode().strip()
    return hail.__version__
def _hail_cite_url():
    """Return the GitHub URL to cite for the installed Hail version.

    The version string is split on ``-`` into exactly two parts, a tag and a
    SHA prefix. When the ``hail-all-spark.jar`` resource is bundled (a pip
    install) the release-tag URL is returned, otherwise the commit URL.
    """
    tag, sha_prefix = version().split("-")
    pip_installed = pkg_resources.resource_exists(__name__, "hail-all-spark.jar")
    if pip_installed:
        return f"https://github.com/hail-is/hail/releases/tag/{tag}"
    return f"https://github.com/hail-is/hail/commit/{sha_prefix}"
def citation(*, bibtex=False):
    """Generate a Hail citation.

    Parameters
    ----------
    bibtex : bool
        Generate a citation in BibTeX form.

    Returns
    -------
    str
    """
    if not bibtex:
        return f"Hail Team. Hail {version()}. {_hail_cite_url()}."

    cite_url = _hail_cite_url()
    # Adjacent literals are concatenated into one single-line BibTeX entry.
    return (
        "@misc{Hail,"
        " author = {Hail Team},"
        " title = {Hail},"
        f" howpublished = {{\\url{{{cite_url}}}}}"
        "}"
    )
def cite_hail():
    """Return the plain-text Hail citation; shorthand for ``citation(bibtex=False)``."""
    return citation(bibtex=False)
def cite_hail_bibtex():
    """Return the Hail citation in BibTeX form; shorthand for ``citation(bibtex=True)``."""
    return citation(bibtex=True)
def stop():
    """Stop the currently running Hail session.

    Does nothing when no session is registered on ``Env``.
    """
    if not Env._hc:
        return
    Env.hc().stop()
def spark_context():
    """Returns the active Spark context.

    Returns
    -------
    :class:`pyspark.SparkContext`
    """
    # The context is the ``sc`` attribute of the object returned by
    # Env.spark_backend; the string argument presumably names the requesting
    # operation for error reporting -- TODO confirm.
    return Env.spark_backend('spark_context').sc
def current_backend():
    """Return the backend object held by the current :class:`.HailContext`."""
    return Env.hc()._backend
def default_reference():
    """Returns the default reference genome of the current Hail session.

    This is the reference genome selected when Hail was initialized
    (``'GRCh37'`` unless a different `default_reference` was given).

    Returns
    -------
    :class:`.ReferenceGenome`
    """
    return Env.hc().default_reference
def get_reference(name):
    """Returns the reference genome corresponding to `name`.

    Notes
    -----
    Hail's built-in references are ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, and
    ``'CanFam3'``.

    The contig names and lengths come from the GATK resource bundle:
    `human_g1k_v37.dict
    <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.dict>`__
    and `Homo_sapiens_assembly38.dict
    <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Homo_sapiens_assembly38.dict>`__.

    If ``name='default'``, the value of :func:`.default_reference` is returned.

    Parameters
    ----------
    name : :class:`str`
        Name of a previously loaded reference genome or one of Hail's built-in
        references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, ``'CanFam3'``, and
        ``'default'``.

    Returns
    -------
    :class:`.ReferenceGenome`
    """
    # Called for its side effect before the lookup; presumably this makes sure
    # a Hail context exists -- TODO confirm against Env.hc.
    Env.hc()
    if name != 'default':
        return ReferenceGenome._references[name]
    return default_reference()
@typecheck(seed=int)
def set_global_seed(seed):
    """Sets Hail's global seed to `seed`.

    Parameters
    ----------
    seed : :obj:`int`
        Integer used to seed Hail's random number generator.
    """
    # Delegates to Env.set_seed, the same call HailContext.__init__ makes.
    Env.set_seed(seed)
def _set_flags(**flags):
    """Set backend flags from keyword arguments.

    Every flag reported as available by the backend is set as it is
    encountered. Names that are not available are collected and reported
    together in a single :class:`.FatalError` after all flags were processed.
    """
    available = set(Env.backend()._jhc.flags().available())
    rejected = []
    for flag_name, flag_value in flags.items():
        if flag_name not in available:
            rejected.append(flag_name)
            continue
        Env.backend()._jhc.flags().set(flag_name, flag_value)
    if rejected:
        raise FatalError("Flags {} not valid. Valid flags: \n {}"
                         .format(', '.join(rejected), '\n '.join(available)))
def _get_flags(*flags):
    """Return a dict mapping each requested flag name to its backend value."""
    values = {}
    for flag_name in flags:
        values[flag_name] = Env.backend()._jhc.flags().get(flag_name)
    return values
def debug_info():
    """Collect debugging information about the running Hail session.

    Returns a dict with the Spark configuration (``'spark_conf'``), the path
    of the bundled ``hail-all-spark.jar`` or ``None`` when it is not packaged
    (``'hail_jar_path'``), and the Hail version (``'version'``).
    """
    jar_name = "hail-all-spark.jar"
    if pkg_resources.resource_exists(__name__, jar_name):
        hail_jar_path = pkg_resources.resource_filename(__name__, jar_name)
    else:
        hail_jar_path = None

    info = {
        'spark_conf': spark_context()._conf.getAll(),
        'hail_jar_path': hail_jar_path,
        'version': version()
    }
    return info