Merge pull request #6865 from jamalex/content_fallback_paths

CONTENT_FALLBACK_DIRS option along with cherrypy/devserver support.
learningequality · May 26, 2020 · aa4e9d5 · aa4e9d5
2 parents 64d0ea0 + 5d05c61
commit aa4e9d5
Show file tree

Hide file tree

Showing 9 changed files with 334 additions and 72 deletions.
diff --git a/kolibri/core/content/management/commands/scanforcontent.py b/kolibri/core/content/management/commands/scanforcontent.py
@@ -7,9 +7,11 @@
 from ...utils.channel_import import FutureSchemaError
 from ...utils.channel_import import import_channel_from_local_db
 from ...utils.channel_import import InvalidSchemaVersionError
-from ...utils.channels import get_channel_ids_for_content_database_dir
+from ...utils.channels import get_channel_ids_for_content_dirs
+from ...utils.channels import read_channel_metadata_from_db_file
+from ...utils.paths import get_content_database_file_path
 from kolibri.core.content.models import ChannelMetadata
-from kolibri.core.content.utils.paths import get_content_database_dir_path
+from kolibri.core.content.utils.paths import get_all_content_dir_paths
 
 logger = logging.getLogger(__name__)
 
@@ -23,30 +25,96 @@ class Command(BaseCommand):
 
     help = "Scan content and databases in Kolibri folder and updates the database to show if available"
 
+    def add_arguments(self, parser):
+        """Register the --channel-import-mode and --channels options on the parser."""
+        mode_help = """
+        Specify the desired behavior for import of channel metadata databases. Value must be one of:
+        - newer: only import if database version is higher than what we already have (default)
+        - missing: only import if we do not yet have the channel at all in the primary database
+        - none: do not import new channel databases, and only annotate for channels we already have
+        """
+        parser.add_argument(
+            "--channel-import-mode",
+            dest="channel_import_mode",
+            choices=["newer", "missing", "none"],
+            default="newer",
+            type=str,
+            required=False,
+            help=mode_help,
+        )
+        # optional filter; left as None (no filtering) when the flag is not given
+        channel_filter_help = """
+        Constrain the content scan to a particular set of channels. Other channels will not be imported
+        or annotated. Separate multiple channel IDs with commas.
+        """
+        parser.add_argument(
+            "--channels",
+            dest="channels",
+            # argparse calls this on the raw value: "a,b" becomes ["a", "b"]
+            type=lambda raw: raw.split(","),
+            default=None,
+            required=False,
+            help=channel_filter_help,
+        )
+
     def handle(self, *args, **options):
-        storage_channel_ids = get_channel_ids_for_content_database_dir(
-            get_content_database_dir_path()
+        """Import channel databases as the mode allows, then annotate every channel."""
+        channel_import_mode = options["channel_import_mode"]
+        channels_to_include = options["channels"]
+        # channel databases present in the primary or any fallback content directory
+        storage_channel_ids = get_channel_ids_for_content_dirs(
+            get_all_content_dir_paths()
         )
         database_channel_ids = list(
             ChannelMetadata.objects.all().values_list("id", flat=True)
         )
         all_channel_ids = set(storage_channel_ids + database_channel_ids)
+
+        # if told not to import any channel databases, constrain to ones we already have
+        if channel_import_mode == "none":
+            all_channel_ids = set(database_channel_ids)
+
+        # if an explicit set of channels was specified, filter out anything not included in that
+        if channels_to_include:
+            all_channel_ids = all_channel_ids.intersection(channels_to_include)
+
         for channel_id in all_channel_ids:
-            if channel_id not in database_channel_ids:
-                try:
-                    import_channel_from_local_db(channel_id)
-                    set_content_visibility_from_disk(channel_id)
-                except (InvalidSchemaVersionError, FutureSchemaError):
-                    logger.warning(
-                        "Tried to import channel {channel_id}, but database file was incompatible".format(
-                            channel_id=channel_id
-                        )
-                    )
-                except DatabaseError:
-                    logger.warning(
-                        "Tried to import channel {channel_id}, but database file was corrupted.".format(
-                            channel_id=channel_id
-                        )
-                    )
-            else:
-                set_content_visibility_from_disk(channel_id)
+            # resolves to the primary copy, else a fallback copy, of the database file
+            disk_path = get_content_database_file_path(channel_id)
+
+            # import only when a database file is on disk and the mode allows it
+            import_database = False
+            if channel_id in storage_channel_ids:
+                if channel_import_mode == "missing":
+                    import_database = channel_id not in database_channel_ids
+                elif channel_import_mode == "newer":
+                    import_database = self.database_file_is_newer(channel_id, disk_path)
+            if import_database:
+                self.import_channel_database(channel_id, disk_path)
+            self.annotate_channel(channel_id)
+
+    def database_file_is_newer(self, channel_id, disk_path):
+        """Return True if disk_path holds a newer (or not yet imported) channel version."""
+        try:
+            on_disk = read_channel_metadata_from_db_file(disk_path)
+            stored = ChannelMetadata.objects.get(id=channel_id)
+            return on_disk.version > stored.version
+        except ChannelMetadata.DoesNotExist:
+            # nothing imported yet for this channel, so the file counts as newer
+            return True
+        except DatabaseError:
+            # a database error means the file on disk can't be considered newer
+            return False
+
+    def import_channel_database(self, channel_id, disk_path):  # disk_path is only logged
+        logger.info("Attempting import of channel database at: {}".format(disk_path))
+        try:  # best effort: the database errors caught below are logged, not raised
+            import_channel_from_local_db(channel_id)
+        except (InvalidSchemaVersionError, FutureSchemaError):  # schema version problem
+            logger.warning("Database file was incompatible; skipping.")
+        except DatabaseError:  # any other database-level failure during the import
+            logger.warning("Database file was corrupted; skipping.")
+
+    def annotate_channel(self, channel_id):  # logs, then updates visibility from disk
+        logger.info("Annotating availability for channel: {}".format(channel_id))
+        set_content_visibility_from_disk(channel_id)
diff --git a/kolibri/core/content/upgrade.py b/kolibri/core/content/upgrade.py
@@ -20,8 +20,8 @@
 from kolibri.core.content.utils.channel_import import FutureSchemaError
 from kolibri.core.content.utils.channel_import import import_channel_from_local_db
 from kolibri.core.content.utils.channel_import import InvalidSchemaVersionError
-from kolibri.core.content.utils.channels import get_channel_ids_for_content_database_dir
-from kolibri.core.content.utils.paths import get_content_database_dir_path
+from kolibri.core.content.utils.channels import get_channel_ids_for_content_dirs
+from kolibri.core.content.utils.paths import get_all_content_dir_paths
 from kolibri.core.content.utils.paths import get_content_database_file_path
 from kolibri.core.content.utils.sqlalchemybridge import Bridge
 from kolibri.core.upgrade import version_upgrade
@@ -40,9 +40,7 @@ def import_external_content_dbs():
     scan through the content database folder for all channel content databases,
     and pull the data from each database if we have not already imported it.
     """
-    channel_ids = get_channel_ids_for_content_database_dir(
-        get_content_database_dir_path()
-    )
+    channel_ids = get_channel_ids_for_content_dirs(get_all_content_dir_paths())
     for channel_id in channel_ids:
         if not ChannelMetadata.objects.filter(id=channel_id).exists():
             try:

diff --git a/kolibri/core/content/utils/channels.py b/kolibri/core/content/utils/channels.py
@@ -13,6 +13,16 @@
 logger = logging.getLogger(__name__)
 
 
+def get_channel_ids_for_content_dirs(content_dirs):
+    """Return a list of the distinct channel IDs found under the given content dirs."""
+    # resolve every "databases" directory up front, then scan them one by one
+    db_dirs = [get_content_database_dir_path(contentfolder=d) for d in content_dirs]
+    found_ids = set()
+    for db_dir in db_dirs:
+        found_ids.update(get_channel_ids_for_content_database_dir(db_dir))
+    return list(found_ids)
+
+
 def get_channel_ids_for_content_database_dir(content_database_dir):
     """
     Returns a list of channel IDs for the channel databases that exist in a content database directory.

diff --git a/kolibri/core/content/utils/paths.py b/kolibri/core/content/utils/paths.py
@@ -17,6 +17,16 @@
 # TODO: add ".epub" and ".epub3" if epub-equivalent of ZipContentView implemented
 
 
+def _maybe_makedirs(path):
+    """Create path (with parents) if it is missing; failures to create are ignored."""
+    if os.path.isdir(path):
+        return
+    try:
+        os.makedirs(path)
+    except OSError:  # e.g. non-writable external drives (USB imports) need no new dirs
+        pass
+
+
 def get_attribute(obj, key):
     """
     Get an attribute from an object, regardless of whether it is a dict or an object
@@ -49,69 +59,134 @@ def get_local_content_storage_file_url(obj):
 # DISK PATHS
 
 
-def get_content_dir_path(datafolder=None):
-    return (
-        os.path.join(datafolder, "content")
-        if datafolder
-        else conf.OPTIONS["Paths"]["CONTENT_DIR"]
-    )
+def get_content_dir_path(datafolder=None, contentfolder=None):
+    """Return contentfolder, else <datafolder>/content, else the configured CONTENT_DIR."""
+    if contentfolder:
+        return contentfolder
+    if datafolder:
+        return os.path.join(datafolder, "content")
+    return conf.OPTIONS["Paths"]["CONTENT_DIR"]
 
 
-def get_content_database_dir_path(datafolder=None):
+def get_content_fallback_paths():
+    """
+    Return the configured CONTENT_FALLBACK_DIRS entries with surrounding
+    whitespace stripped, skipping entries that are blank.
+    """
+    configured = conf.OPTIONS["Paths"]["CONTENT_FALLBACK_DIRS"]
+    # strip each entry first, then drop the ones that end up empty
+    stripped = [entry.strip() for entry in configured]
+    return [entry for entry in stripped if entry]
+
+
+def get_all_content_dir_paths():  # primary content dir first, then the fallback dirs
+    return [get_content_dir_path()] + get_content_fallback_paths()
+
+
+def existing_file_path_in_content_fallback_dirs(subpath):
+    """Return the first fallback-dir path where subpath exists, or None if nowhere."""
+    candidates = (
+        os.path.join(fallback_dir, subpath)
+        for fallback_dir in get_content_fallback_paths()
+    )
+    # next() stops at the first hit, so later directories are not checked
+    return next((path for path in candidates if os.path.exists(path)), None)
+
+
+def get_content_database_dir_path(datafolder=None, contentfolder=None):
     """
     Returns the path to the content sqlite databases
     ($HOME/.kolibri/content/databases on POSIX systems, by default)
     """
-    path = os.path.join(get_content_dir_path(datafolder), "databases")
-    if not os.path.isdir(path):
-        try:
-            os.makedirs(path)
-        # When importing from USB, it does not need to create a database
-        # directory under the external drives that are not writable.
-        except OSError:
-            pass
+    base = get_content_dir_path(datafolder=datafolder, contentfolder=contentfolder)
+    path = os.path.join(base, "databases")
+    # creation is best-effort: _maybe_makedirs ignores OSError (e.g. read-only media),
+    # so the returned directory is not guaranteed to exist
+    _maybe_makedirs(path)
     return path
 
 
-def get_content_database_file_path(channel_id, datafolder=None):
+def get_content_database_file_path(channel_id, datafolder=None, contentfolder=None):
     """
     Given a channel_id, returns the path to the sqlite3 file
     ($HOME/.kolibri/content/databases/<channel_id>.sqlite3 on POSIX systems, by default)
     """
-    return os.path.join(
-        get_content_database_dir_path(datafolder), "{}.sqlite3".format(channel_id)
+    db_filename = "{}.sqlite3".format(channel_id)
+    folders = {"datafolder": datafolder, "contentfolder": contentfolder}
+    # candidate inside the primary (or explicitly requested) databases directory
+    primary_path = os.path.join(
+        get_content_database_dir_path(**folders),
+        db_filename,
     )
+    # an existing primary file, or an explicit folder override, wins over fallbacks
+    folder_overridden = datafolder is not None or contentfolder is not None
+    if os.path.exists(primary_path) or folder_overridden:
+        return primary_path
+    # look for databases/<channel_id>.sqlite3 in the fallback directories
+    fallback_path = existing_file_path_in_content_fallback_dirs(
+        os.path.join("databases", db_filename)
+    )
+    if fallback_path:
+        return fallback_path
+    # nothing found anywhere: return the primary path (even though it doesn't exist yet)
+    return primary_path
 
 
-def get_upgrade_content_database_file_path(channel_id, datafolder=None):
+def get_upgrade_content_database_file_path(
+    channel_id, datafolder=None, contentfolder=None
+):  # path of "<channel_id>-upgrade.sqlite3"; fallback dirs are not consulted here
     return os.path.join(
-        get_content_database_dir_path(datafolder),
+        get_content_database_dir_path(
+            datafolder=datafolder, contentfolder=contentfolder
+        ),
         "{}-upgrade.sqlite3".format(channel_id),
     )
 
 
-def get_annotated_content_database_file_path(channel_id, datafolder=None):
+def get_annotated_content_database_file_path(
+    channel_id, datafolder=None, contentfolder=None
+):  # path of "<channel_id>-annotated.sqlite3"; fallback dirs are not consulted here
     return os.path.join(
-        get_content_database_dir_path(datafolder),
+        get_content_database_dir_path(
+            datafolder=datafolder, contentfolder=contentfolder
+        ),
         "{}-annotated.sqlite3".format(channel_id),
     )
 
 
-def get_content_storage_dir_path(datafolder=None):
-    path = os.path.join(get_content_dir_path(datafolder), "storage")
-    if not os.path.isdir(path):
-        os.makedirs(path)
+def get_content_storage_dir_path(datafolder=None, contentfolder=None):
+    """Return the content "storage" directory, creating it when possible."""
+    base = get_content_dir_path(datafolder=datafolder, contentfolder=contentfolder)
+    path = os.path.join(base, "storage")
+    # unlike a bare os.makedirs, the helper ignores OSError (e.g. read-only media)
+    _maybe_makedirs(path)
     return path
 
 
-def get_content_storage_file_path(filename, datafolder=None):
+def get_content_storage_file_path(filename, datafolder=None, contentfolder=None):
     if not VALID_STORAGE_FILENAME.match(filename):
         raise InvalidStorageFilenameError(
             "'{}' is not a valid content storage filename".format(filename)
         )
-    return os.path.join(
-        get_content_storage_dir_path(datafolder), filename[0], filename[1], filename
+    # files are nested under directories named by their first two characters
+    nested_name = os.path.join(filename[0], filename[1], filename)
+    folders = {"datafolder": datafolder, "contentfolder": contentfolder}
+    primary_path = os.path.join(
+        get_content_storage_dir_path(**folders),
+        nested_name,
+    )
+    # an existing primary file, or an explicit folder override, always wins;
+    # fallback directories are only consulted for the default location
+    folder_overridden = datafolder is not None or contentfolder is not None
+    if os.path.exists(primary_path) or folder_overridden:
+        return primary_path
+    # look for storage/<c0>/<c1>/<filename> in the fallback directories, in
+    # the order they are configured
+    fallback_path = existing_file_path_in_content_fallback_dirs(
+        os.path.join("storage", nested_name)
     )
+    # fallback copy if one exists; otherwise the primary path (even though it doesn't exist yet)
+    return fallback_path or primary_path
 
 
 # URL PATHS

diff --git a/kolibri/core/urls.py b/kolibri/core/urls.py
@@ -40,10 +40,11 @@
 from django.conf.urls.static import static
 
 from .views import GuestRedirectView
-from .views import StatusCheckView
 from .views import logout_view
 from .views import RootURLRedirectView
 from .views import set_language
+from .views import static_serve_with_fallbacks
+from .views import StatusCheckView
 from .views import UnsupportedBrowserView
 from kolibri.core.content.utils import paths
 from kolibri.core.device.translation import i18n_patterns
@@ -75,6 +76,7 @@
 
 urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
 
+content_dirs = paths.get_all_content_dir_paths()  # primary content dir, then fallbacks
 urlpatterns += static(
-    paths.get_content_url("/"), document_root=paths.get_content_dir_path()
+    paths.get_content_url("/"), view=static_serve_with_fallbacks(content_dirs)
 )