Merge pull request #1653 from opensafely-core/evansd/sandbox
Isolate user-supplied code
evansd committed Nov 1, 2023
2 parents e228584 + 1eb3df4 commit eae0ece
Showing 47 changed files with 1,482 additions and 232 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -130,6 +130,7 @@ COPY ehrql /app/ehrql
RUN python -m compileall /app/ehrql
COPY databuilder /app/databuilder
RUN python -m compileall /app/databuilder
COPY bin /app/bin

# The following build details will change.
# These are the last step to make better use of Docker's build cache,
1 change: 1 addition & 0 deletions Justfile
@@ -83,6 +83,7 @@ devenv: prodenv _virtualenv && _install-precommit
test requirements.dev.txt -nt $VIRTUAL_ENV/.dev || exit 0
$PIP install -r requirements.dev.txt
$PIP install --no-deps --editable .
touch $VIRTUAL_ENV/.dev
20 changes: 20 additions & 0 deletions bin/LICENSE
@@ -0,0 +1,20 @@
The accompanying binary `pledge` is included under the license below.

---

ISC License

Copyright 2020 Justine Alexandra Roberts Tunney

Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
Binary file added bin/pledge
95 changes: 95 additions & 0 deletions docs/includes/generated_docs/cli.md
@@ -67,6 +67,20 @@ Experimental command for running assurance tests.
Internal command for testing the database connection configuration.
</p>

<div class="attr-heading">
<a href="#serialize-definition"><tt>serialize-definition</tt></a>
</div>
<p class="indent">
Internal command for serializing a definition file to a JSON representation.
</p>

<div class="attr-heading">
<a href="#isolation-report"><tt>isolation-report</tt></a>
</div>
<p class="indent">
Internal command for testing code isolation support.
</p>

</div>

<div class="attr-heading" id="ehrql.help">
@@ -591,3 +605,84 @@ Dotted import path to Backend class, or one of: `emis`, `tpp`
Database connection string.

</div>


<h2 id="serialize-definition" data-toc-label="serialize-definition" markdown>
serialize-definition
</h2>
```
ehrql serialize-definition DEFINITION_FILE [--help]
[--definition-type DEFINITION_TYPE] [--output OUTPUT_FILE]
[ -- ... PARAMETERS ...]
```
Internal command for serializing a definition file to a JSON representation.

Note that **this is an internal command** and is not intended for end users.

<div class="attr-heading" id="serialize-definition.definition_file">
<tt>DEFINITION_FILE</tt>
<a class="headerlink" href="#serialize-definition.definition_file" title="Permanent link">🔗</a>
</div>
<div markdown="block" class="indent">
Definition file path

</div>

<div class="attr-heading" id="serialize-definition.help">
<tt>-h, --help</tt>
<a class="headerlink" href="#serialize-definition.help" title="Permanent link">🔗</a>
</div>
<div markdown="block" class="indent">
show this help message and exit

</div>

<div class="attr-heading" id="serialize-definition.definition-type">
<tt>-t, --definition-type DEFINITION_TYPE</tt>
<a class="headerlink" href="#serialize-definition.definition-type" title="Permanent link">🔗</a>
</div>
<div markdown="block" class="indent">
Options: `dataset`, `measures`, `test`

</div>

<div class="attr-heading" id="serialize-definition.output">
<tt>-o, --output OUTPUT_FILE</tt>
<a class="headerlink" href="#serialize-definition.output" title="Permanent link">🔗</a>
</div>
<div markdown="block" class="indent">
Output file path (stdout by default)

</div>

<div class="attr-heading" id="serialize-definition.user_args">
<tt>PARAMETERS</tt>
<a class="headerlink" href="#serialize-definition.user_args" title="Permanent link">🔗</a>
</div>
<div markdown="block" class="indent">
Parameters are extra arguments you can pass to your Python definition file. They must be
supplied after all ehrQL arguments and separated from the ehrQL arguments with a
double-dash ` -- `.


</div>
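The double-dash handling described above can be sketched in a few lines. This is an illustrative split of a CLI's own arguments from user-supplied parameters at the `--` marker; the helper name `split_user_args` is ours, not ehrql's:

```python
# Minimal sketch of separating a CLI's own arguments from user-supplied
# parameters at the "--" marker, as described above. The helper name
# `split_user_args` is illustrative and not taken from ehrql.
def split_user_args(argv):
    if "--" in argv:
        marker = argv.index("--")
        return argv[:marker], argv[marker + 1:]
    return argv, []

own_args, user_args = split_user_args(
    ["definition.py", "-o", "out.json", "--", "--year", "2023"]
)
# own_args  → ["definition.py", "-o", "out.json"]
# user_args → ["--year", "2023"]
```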


<h2 id="isolation-report" data-toc-label="isolation-report" markdown>
isolation-report
</h2>
```
ehrql isolation-report [--help]
```
Internal command for testing code isolation support.

Note that **this is an internal command** and is not intended for end users.

<div class="attr-heading" id="isolation-report.help">
<tt>-h, --help</tt>
<a class="headerlink" href="#isolation-report.help" title="Permanent link">🔗</a>
</div>
<div markdown="block" class="indent">
show this help message and exit

</div>
63 changes: 61 additions & 2 deletions ehrql/__main__.py
@@ -8,17 +8,19 @@

from ehrql import __version__
from ehrql.file_formats import FILE_FORMATS, get_file_extension
from ehrql.loaders import DEFINITION_LOADERS, DefinitionError
from ehrql.utils.string_utils import strip_indent

from .main import (
CommandError,
assure,
create_dummy_tables,
dump_dataset_sql,
dump_example_data,
generate_dataset,
generate_measures,
run_isolation_report,
run_sandbox,
serialize_definition,
test_connection,
)

@@ -88,7 +90,7 @@ def main(args, environ=None):

try:
function(**kwargs)
except CommandError as e:
except DefinitionError as e:
print(str(e), file=sys.stderr)
sys.exit(1)
finally:
@@ -128,6 +130,8 @@ def show_help(**kwargs):
add_create_dummy_tables(subparsers, environ, user_args)
add_assure(subparsers, environ, user_args)
add_test_connection(subparsers, environ, user_args)
add_serialize_definition(subparsers, environ, user_args)
add_isolation_report(subparsers, environ, user_args)

return parser

@@ -226,6 +230,7 @@ def add_create_dummy_tables(subparsers, environ, user_args):
)
parser.set_defaults(function=create_dummy_tables)
parser.set_defaults(user_args=user_args)
parser.set_defaults(environ=environ)
add_dataset_definition_file_argument(parser, environ)
parser.add_argument(
"dummy_tables_path",
@@ -348,6 +353,60 @@ def add_dump_example_data(subparsers, environ, user_args):
parser.set_defaults(environ=environ)


def add_serialize_definition(subparsers, environ, user_args):
parser = subparsers.add_parser(
"serialize-definition",
help=strip_indent(
"""
Internal command for serializing a definition file to a JSON representation.
Note that **this is an internal command** and is not intended for end users.
"""
),
formatter_class=RawTextHelpFormatter,
)
parser.set_defaults(function=serialize_definition)
parser.set_defaults(environ=environ)
parser.set_defaults(user_args=user_args)

parser.add_argument(
"-t",
"--definition-type",
type=str,
choices=DEFINITION_LOADERS.keys(),
default="dataset",
help=f"Options: {backtick_join(DEFINITION_LOADERS.keys())}",
)
parser.add_argument(
"-o",
"--output",
help=strip_indent("Output file path (stdout by default)"),
type=Path,
dest="output_file",
)
parser.add_argument(
"definition_file",
help="Definition file path",
type=existing_python_file,
metavar="definition_file",
)


def add_isolation_report(subparsers, environ, user_args):
parser = subparsers.add_parser(
"isolation-report",
help=strip_indent(
"""
Internal command for testing code isolation support.
Note that **this is an internal command** and is not intended for end users.
"""
),
formatter_class=RawTextHelpFormatter,
)
parser.set_defaults(function=run_isolation_report)


def create_internal_argument_group(parser, environ):
return parser.add_argument_group(
title="Internal Arguments",
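The `add_*` functions in this diff all follow the same pattern: each subparser binds its handler and shared context via `set_defaults`, and `main()` pops those values back out to dispatch. A stripped-down, runnable sketch of that pattern (the `hello` command is invented for illustration):

```python
import argparse

# Sketch of the subcommand-dispatch pattern used in ehrql/__main__.py:
# each subparser binds its handler via set_defaults(function=...), and the
# entry point calls the chosen function with the remaining parsed arguments.
def hello(name):
    return f"hello {name}"

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()
sub = subparsers.add_parser("hello")
sub.add_argument("name")
sub.set_defaults(function=hello)

args = vars(parser.parse_args(["hello", "world"]))
function = args.pop("function")
result = function(**args)  # → "hello world"
```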
6 changes: 2 additions & 4 deletions ehrql/assurance.py
@@ -1,6 +1,5 @@
from ehrql.query_engines.in_memory import InMemoryQueryEngine
from ehrql.query_engines.in_memory_database import InMemoryDatabase
from ehrql.query_language import compile
from ehrql.query_model.introspection import get_table_nodes
from ehrql.utils.orm_utils import orm_classes_from_tables

@@ -10,8 +9,8 @@
UNEXPECTED_VALUE = "unexpected-value"


def validate(dataset, test_data):
"""Given some input data, validates that a dataset definition produces the given
def validate(variable_definitions, test_data):
"""Given some input data, validates that the given variables produce the given
expected output.
test_data contains two parts: data for each patient to be inserted into the
@@ -47,7 +46,6 @@ def validate(dataset, test_data):
"""

# Create objects to insert into database
variable_definitions = compile(dataset)
table_nodes = get_table_nodes(*variable_definitions.values())
orm_classes = orm_classes_from_tables(table_nodes)
input_data = []
2 changes: 1 addition & 1 deletion ehrql/docs/language.py
@@ -16,7 +16,7 @@
# We document `WhenThen` and `when` as part of `case`
ql.WhenThen,
ql.when,
ql.DummyDatasetConfig,
ql.DummyDataConfig,
}


14 changes: 7 additions & 7 deletions ehrql/file_formats/arrow.py
@@ -177,14 +177,14 @@ def next_transposed_batch():

class ArrowDatasetReader(BaseDatasetReader):
def _open(self):
self.fileobj = pyarrow.memory_map(str(self.filename), "rb")
self.reader = pyarrow.ipc.open_file(self.fileobj)
self._fileobj = pyarrow.memory_map(str(self.filename), "rb")
self._reader = pyarrow.ipc.open_file(self._fileobj)

def _validate_basic(self):
# Arrow enforces that all record batches have a consistent schema and that any
# categorical columns use the same dictionary, so we only need to get the first
# batch in order to validate
batch = self.reader.get_record_batch(0)
batch = self._reader.get_record_batch(0)
validate_columns(batch.schema.names, self.column_specs)
errors = []
for name, spec in self.column_specs.items():
@@ -215,15 +215,15 @@ def _validate_column(self, name, column, spec):
)

def __iter__(self):
for i in range(self.reader.num_record_batches):
batch = self.reader.get_record_batch(i)
for i in range(self._reader.num_record_batches):
batch = self._reader.get_record_batch(i)
# Use `zip(*...)` to transpose from column-wise to row-wise
yield from zip(
*(batch.column(name).to_pylist() for name in self.column_specs)
)

def close(self):
# `self.reader` does not need closing: it acts as a contextmanager, but its exit
# `self._reader` does not need closing: it acts as a contextmanager, but its exit
# method is a no-op, see:
# https://github.com/apache/arrow/blob/1706b095/python/pyarrow/ipc.pxi#L1032-L1036
self.fileobj.close()
self._fileobj.close()
7 changes: 7 additions & 0 deletions ehrql/file_formats/base.py
@@ -1,9 +1,16 @@
import pathlib


class ValidationError(Exception):
pass


class BaseDatasetReader:
def __init__(self, filename, column_specs):
if not isinstance(filename, pathlib.Path):
raise ValidationError(
f"`filename` must be a pathlib.Path instance, got: {filename!r}"
)
self.filename = filename
self.column_specs = column_specs
self._open()
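The new constructor check means readers now fail fast when handed a plain string instead of a `pathlib.Path`. A self-contained sketch of the guard; `check_filename` is our name for illustration, whereas in the diff the check lives directly in `BaseDatasetReader.__init__`:

```python
import pathlib

class ValidationError(Exception):
    pass

# The type guard from BaseDatasetReader.__init__, extracted into a helper
# for illustration (`check_filename` is not an ehrql name).
def check_filename(filename):
    if not isinstance(filename, pathlib.Path):
        raise ValidationError(
            f"`filename` must be a pathlib.Path instance, got: {filename!r}"
        )
    return filename
```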
21 changes: 8 additions & 13 deletions ehrql/file_formats/csv.py
@@ -58,12 +58,7 @@ def format_bool(value):
return "T" if value else "F"


class CSVStreamDatasetReader(BaseDatasetReader):
def _open(self):
# Support supplying the file object directly as the first argument for testing
# purposes
self.fileobj = self.filename

class BaseCSVDatasetReader(BaseDatasetReader):
def _validate_basic(self):
# CSV being what it is, we can't properly validate the types it contains without
# reading the entire thing, which we don't want to do. So we read the first 10 rows
@@ -72,8 +67,8 @@ def _validate_basic(self):
pass

def __iter__(self):
self.fileobj.seek(0)
reader = csv.reader(self.fileobj)
self._fileobj.seek(0)
reader = csv.reader(self._fileobj)
headers = next(reader)
validate_columns(headers, self.column_specs.keys())
row_parser = create_row_parser(headers, self.column_specs)
@@ -84,17 +79,17 @@
raise ValidationError(f"row {n}: {e}")

def close(self):
self.fileobj.close()
self._fileobj.close()


class CSVDatasetReader(CSVStreamDatasetReader):
class CSVDatasetReader(BaseCSVDatasetReader):
def _open(self):
self.fileobj = open(self.filename, newline="")
self._fileobj = open(self.filename, newline="")


class CSVGZDatasetReader(CSVStreamDatasetReader):
class CSVGZDatasetReader(BaseCSVDatasetReader):
def _open(self):
self.fileobj = gzip.open(self.filename, "rt", newline="")
self._fileobj = gzip.open(self.filename, "rt", newline="")


def create_row_parser(headers, column_specs):
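The refactor above replaces `CSVStreamDatasetReader` with a `BaseCSVDatasetReader` that owns iteration over `self._fileobj`, while each subclass overrides only `_open()` for its file type — a classic template-method split. A simplified, runnable sketch of that shape (class names follow the diff; column validation and row parsing are omitted):

```python
import csv
import gzip
import tempfile

class BaseCSVDatasetReader:
    """Owns iteration over self._fileobj; subclasses supply _open()."""

    def __init__(self, filename):
        self.filename = filename
        self._open()

    def __iter__(self):
        self._fileobj.seek(0)
        reader = csv.reader(self._fileobj)
        next(reader)  # skip the header row
        yield from reader

    def close(self):
        self._fileobj.close()

class CSVDatasetReader(BaseCSVDatasetReader):
    def _open(self):
        self._fileobj = open(self.filename, newline="")

class CSVGZDatasetReader(BaseCSVDatasetReader):
    def _open(self):
        self._fileobj = gzip.open(self.filename, "rt", newline="")
```

Adding a new compression format then only requires a subclass with a two-line `_open()`, which is exactly what the gzip variant in the diff does.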
