From 2cc7cbffcd542011d6544547e63cb7faa5a1524c Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Tue, 6 May 2025 13:03:14 -0700 Subject: [PATCH 1/7] Refactor restore-channel utilities --- .../management/commands/restore_channel.py | 61 +- .../tests/test_restore_channel.py | 190 --- .../contentcuration/utils/files.py | 35 +- .../contentcuration/utils/import_tools.py | 1127 +++++++++-------- 4 files changed, 674 insertions(+), 739 deletions(-) delete mode 100644 contentcuration/contentcuration/tests/test_restore_channel.py diff --git a/contentcuration/contentcuration/management/commands/restore_channel.py b/contentcuration/contentcuration/management/commands/restore_channel.py index 6133ec3806..49b785f725 100644 --- a/contentcuration/contentcuration/management/commands/restore_channel.py +++ b/contentcuration/contentcuration/management/commands/restore_channel.py @@ -2,27 +2,66 @@ from django.core.management.base import BaseCommand -from contentcuration.utils.import_tools import import_channel +from contentcuration.utils.import_tools import ImportManager logger = logging.getLogger("command") class Command(BaseCommand): + """ + This command is used to restore a channel from another Studio instance. This is for + development purposes only and should not be used in production. + """ + def add_arguments(self, parser): # ID of channel to read data from parser.add_argument("source_id", type=str) # ID of channel to write data to (can be same as source channel) - parser.add_argument("--target", help="restore channel db to TARGET CHANNEL ID") - parser.add_argument("--download-url", help="where to download db from") - parser.add_argument("--editor", help="add user as editor to channel") + parser.add_argument( + "--target", + help="A different channel ID for which to restore the channel. 
If not provided, the source channel ID will be used.", + ) + parser.add_argument( + "--source-url", + default="http://localhost:8080", + help="Studio instance from which to download the channel DB or content files", + ) + parser.add_argument("--token", help="API token for the Studio instance") + parser.add_argument( + "--editor", + default="a@a.com", + help="Add user as editor to channel with provided email address", + ) + parser.add_argument( + "--download-content", + action="store_true", + default=False, + help="Whether to download content files", + ) + parser.add_argument( + "--public", + action="store_true", + default=False, + help="Whether to make the channel public", + ) + parser.add_argument( + "--publish", + action="store_true", + default=False, + help="Whether to publish the channel after restoration", + ) def handle(self, *args, **options): - # Set up variables for restoration process logger.info("\n\n********** STARTING CHANNEL RESTORATION **********") - source_id = options["source_id"] - target_id = options.get("target") or source_id - download_url = options.get("download_url") - editor = options.get("editor") - - import_channel(source_id, target_id, download_url, editor, logger=logger) + manager = ImportManager( + options["source_url"], + options["source_id"], + target_id=options.get("target"), + editor=options.get("editor"), + public=options.get("public"), + publish=options.get("publish"), + token=options.get("token"), + download_content=options.get("download_content"), + ) + manager.run() diff --git a/contentcuration/contentcuration/tests/test_restore_channel.py b/contentcuration/contentcuration/tests/test_restore_channel.py deleted file mode 100644 index 6c5e1500ff..0000000000 --- a/contentcuration/contentcuration/tests/test_restore_channel.py +++ /dev/null @@ -1,190 +0,0 @@ -# -*- coding: utf-8 -*- -import datetime -import json -import uuid -from io import BytesIO - -from django.core.files.storage import default_storage -from django.template.loader import render_to_string -from django.utils.translation import activate -from django.utils.translation import deactivate -from le_utils.constants import exercises -from mixer.backend.django import mixer -from mock import MagicMock -from mock import patch - -from .base import StudioTestCase -from contentcuration.models import AssessmentItem -from contentcuration.models import generate_object_storage_name -from contentcuration.utils.import_tools import create_channel -from contentcuration.utils.import_tools import generate_assessment_item -from contentcuration.utils.import_tools import process_content - - -thumbnail_path = "/content/thumbnail.png" -ASSESSMENT_DATA = { - "input-question-test": { - "template": "perseus/input_question.json", - "type": exercises.INPUT_QUESTION, - "question": "Input question", - "question_images": [{"name": "test.jpg", "width": 12.71, "height": 12.12}], - "hints": [{"hint": "Hint 1"}], - "answers": [ - {"answer": "1", "correct": True, "images": []}, - {"answer": "2", "correct": True, "images": []}, - ], - "order": 0, - }, - "multiple-selection-test": { - "template": "perseus/multiple_selection.json", - "type": exercises.MULTIPLE_SELECTION, - "question": "Multiple selection question", - "question_images": [], - "hints": [], - "answers": [ - {"answer": "A", "correct": True, "images": []}, - {"answer": "B", "correct": True, "images": []}, - {"answer": "C", "correct": False, "images": []}, - ], - "multiple_select": True, - "order": 1, - "randomize": False, - }, - "single-selection-test": { - "template": 
"perseus/multiple_selection.json", - "type": exercises.SINGLE_SELECTION, - "question": "Single select question", - "question_images": [], - "hints": [{"hint": "Hint test"}], - "answers": [ - {"answer": "Correct answer", "correct": True, "images": []}, - {"answer": "Incorrect answer", "correct": False, "images": []}, - ], - "multiple_select": False, - "order": 2, - "randomize": True, - }, - "perseus-question-test": { - "template": "perseus/perseus_question.json", - "type": exercises.PERSEUS_QUESTION, - "order": 3, - "raw_data": "{}", - }, -} - - -class ChannelRestoreUtilityFunctionTestCase(StudioTestCase): - @patch( - "contentcuration.utils.import_tools.write_to_thumbnail_file", - return_value=thumbnail_path, - ) - def setUp(self, thumb_mock): - self.id = uuid.uuid4().hex - self.name = "test name" - self.description = "test description" - self.thumbnail_encoding = "base64 string" - self.root_pk = uuid.uuid4() - self.version = 7 - self.last_updated = datetime.datetime.now() - self.cursor_mock = MagicMock() - self.cursor_mock.execute.return_value.fetchone.return_value = ( - self.id, - self.name, - self.description, - self.thumbnail_encoding, - self.root_pk, - self.version, - self.last_updated, - ) - self.channel, _ = create_channel(self.cursor_mock, self.id, self.admin_user) - - def test_restore_channel_id(self): - self.assertEqual(self.channel.id, self.id) - - def test_restore_channel_name(self): - self.assertEqual(self.channel.name, self.name) - - def test_restore_channel_description(self): - self.assertEqual(self.channel.description, self.description) - - def test_restore_channel_thumbnail(self): - self.assertEqual(self.channel.thumbnail, thumbnail_path) - - def test_restore_channel_thumbnail_encoding(self): - self.assertEqual( - self.channel.thumbnail_encoding["base64"], self.thumbnail_encoding - ) - - def test_restore_channel_version(self): - self.assertEqual(self.channel.version, self.version) - - -class PerseusRestoreTestCase(StudioTestCase): - def setUp(self): - super(PerseusRestoreTestCase, self).setUp() - image_path = generate_object_storage_name("test", "test.png") - default_storage.save(image_path, BytesIO(b"test")) - - def test_process_content(self): - tests = [ - {"content": "test 1", "output": "test 1", "images": {}}, - { - "content": "test 2 ![test](${☣ LOCALPATH}/images/test.png)", - "output": "test 2 ![test](${☣ CONTENTSTORAGE}/test.png)", - "images": {}, - }, - { - "content": "test 3 ![](${☣ LOCALPATH}/images/test.png)", - "output": "test 3 ![](${☣ CONTENTSTORAGE}/test.png =50x50)", - "images": { - "${☣ LOCALPATH}/images/test.png": {"width": 50, "height": 50} - }, - }, - { - "content": "test 4 ![](${☣ LOCALPATH}/images/test.png) ![](${☣ LOCALPATH}/images/test.png)", - "output": "test 4 ![](${☣ CONTENTSTORAGE}/test.png) ![](${☣ CONTENTSTORAGE}/test.png)", - "images": {}, - }, - { - "content": "test 5  $\\sqrt{36}+\\frac{1}{2}$ ", - "output": "test 5 $$\\sqrt{36}+\\frac{1}{2}$$", - "images": {}, - }, - { - "content": "test 6 $\\frac{1}{2}$ $\\frac{3}{2}$", - "output": "test 6 $$\\frac{1}{2}$$ $$\\frac{3}{2}$$", - "images": {}, - }, - ] - for test in tests: - result = process_content(test, mixer.blend(AssessmentItem)) - self.assertEqual(result, test["output"]) - - def test_generate_assessment_item(self): - # Run in Spanish to ensure we are properly creating JSON with non-localized numbers - activate("es-es") - for assessment_id, data in list(ASSESSMENT_DATA.items()): - assessment_data = json.loads( - render_to_string(data["template"], data).encode("utf-8", "ignore") - ) - 
assessment_item = generate_assessment_item( - assessment_id, data["order"], data["type"], assessment_data - ) - self.assertEqual(assessment_item.type, data["type"]) - self.assertEqual(assessment_item.question, data.get("question", "")) - self.assertEqual(assessment_item.randomize, bool(data.get("randomize"))) - self.assertEqual(assessment_item.raw_data, data.get("raw_data", "")) - for hint in json.loads(assessment_item.hints): - self.assertTrue( - any(h for h in data["hints"] if h["hint"] == hint["hint"]) - ) - for answer in json.loads(assessment_item.answers): - self.assertTrue( - any( - a - for a in data["answers"] - if a["answer"] == str(answer["answer"]) - and a["correct"] == answer["correct"] - ) - ) - deactivate() diff --git a/contentcuration/contentcuration/utils/files.py b/contentcuration/contentcuration/utils/files.py index 0cb447a601..18f21dd702 100644 --- a/contentcuration/contentcuration/utils/files.py +++ b/contentcuration/contentcuration/utils/files.py @@ -85,12 +85,13 @@ def duplicate_file( return file_copy -def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH): +def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH, input_buffer=None): """ Generates a base64 encoding for a thumbnail Args: filename (str): thumbnail to generate encoding from (must be in storage already) dimension (int, optional): desired width of thumbnail. Defaults to 400. + input_buffer (BytesIO, optional): buffer to read from. Defaults to None. Returns base64 encoding of resized thumbnail """ @@ -103,23 +104,23 @@ def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH): # make sure the aspect ratio between width and height is 16:9 thumbnail_size = [dimension, round(dimension / 1.77)] try: - if not filename.startswith(settings.STATIC_ROOT): - filename = generate_object_storage_name(checksum, filename) - inbuffer = default_storage.open(filename, "rb") - - else: - # Normalize the path and ensure it is indeed within STATIC_ROOT - normalized_path = os.path.normpath(filename) - static_root = os.path.abspath(settings.STATIC_ROOT) - abs_path = os.path.abspath(normalized_path) - if not abs_path.startswith(static_root + os.sep): - raise ValueError("Attempted access to file outside of STATIC_ROOT") - inbuffer = open(abs_path, "rb") - - if not inbuffer: + if not input_buffer: + if not filename.startswith(settings.STATIC_ROOT): + filename = generate_object_storage_name(checksum, filename) + input_buffer = default_storage.open(filename, "rb") + else: + # Normalize the path and ensure it is indeed within STATIC_ROOT + normalized_path = os.path.normpath(filename) + static_root = os.path.abspath(settings.STATIC_ROOT) + abs_path = os.path.abspath(normalized_path) + if not abs_path.startswith(static_root + os.sep): + raise ValueError("Attempted access to file outside of STATIC_ROOT") + input_buffer = open(filename, "rb") + + if not input_buffer: raise AssertionError - with Image.open(inbuffer) as image: + with Image.open(input_buffer) as image: image_format = image.format # Note: Image.thumbnail ensures that the image will fit in the @@ -136,7 +137,7 @@ def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH): finally: # Try to close the inbuffer if it has been created try: - inbuffer.close() + input_buffer.close() except UnboundLocalError: pass outbuffer.close() diff --git a/contentcuration/contentcuration/utils/import_tools.py b/contentcuration/contentcuration/utils/import_tools.py index 0a187ce4c9..8f094ed7eb 100644 --- a/contentcuration/contentcuration/utils/import_tools.py +++ 
b/contentcuration/contentcuration/utils/import_tools.py @@ -1,30 +1,37 @@ # -*- coding: utf-8 -*- import datetime +import hashlib import json import logging import os import re -import shutil import sqlite3 import sys import tempfile -import zipfile +from functools import cached_property from io import BytesIO import requests -from django.conf import settings from django.core.files.storage import default_storage +from django.core.management import call_command from django.db import transaction +from kolibri_content.router import get_active_content_database +from kolibri_content.router import using_content_database +from le_utils.constants import completion_criteria from le_utils.constants import content_kinds from le_utils.constants import exercises from le_utils.constants import format_presets +from le_utils.constants import mastery_criteria from le_utils.constants import roles +from le_utils.constants.labels import learning_activities from contentcuration import models -from contentcuration.api import write_raw_content_to_storage from contentcuration.utils.files import create_file_from_contents +from contentcuration.utils.files import get_thumbnail_encoding from contentcuration.utils.files import write_base64_to_file from contentcuration.utils.garbage_collect import get_deleted_chefs_root +from contentcuration.utils.publish import publish_channel +from contentcuration.viewsets.assessmentitem import exercise_image_filename_regex CHANNEL_TABLE = "content_channelmetadata" @@ -47,126 +54,33 @@ log = logging.getLogger(__name__) -def import_channel( - source_id, target_id=None, download_url=None, editor=None, logger=None -): - """ - Import a channel from another Studio instance. This can be used to - copy online Studio channels into local machines for development, - testing, faster editing, or other purposes. - - :param source_id: The UUID of the channel to import from the source Studio instance. - :param target_id: The UUID of the channel on the local instance. Defaults to source_id. - :param download_url: The URL of the Studio instance to import from. - :param editor: The email address of the user you wish to add as an editor, if any. 
+class ImportClient(requests.Session): + def __init__(self, base_url, api_token=None): + super(ImportClient, self).__init__() + self.base_url = base_url + self.api_token = api_token - """ + def __getattr__(self, name): + if name.endswith("_with_token"): + if not self.api_token: + raise ValueError("API token is required for this method.") - global log - if logger: - log = logger - else: - log = logging.getLogger(__name__) - - # Set up variables for the import process - log.info("\n\n********** STARTING CHANNEL IMPORT **********") - start = datetime.datetime.now() - target_id = target_id or source_id - - # Test connection to database - log.info("Connecting to database for channel {}...".format(source_id)) - - tempf = tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False) - conn = None - try: - if download_url: - response = requests.get( - "{}/content/databases/{}.sqlite3".format(download_url, source_id) + target_method = getattr( + super(ImportClient, self), name.replace("_with_token", "") ) - for chunk in response: - tempf.write(chunk) - else: - filepath = "/".join([settings.DB_ROOT, "{}.sqlite3".format(source_id)]) - # Check if database exists - if not default_storage.exists(filepath): - raise IOError("The object requested does not exist.") - with default_storage.open(filepath) as fobj: - shutil.copyfileobj(fobj, tempf) - - tempf.close() - conn = sqlite3.connect(tempf.name) - cursor = conn.cursor() - - # Start by creating channel - log.info("Creating channel...") - editor = models.User.objects.get(email=editor) - channel, root_pk = create_channel(conn, target_id, editor) - channel.editors.add(editor) - channel.save() - - # Create root node - root = models.ContentNode.objects.create( - node_id=root_pk, - title=channel.name, - kind_id=content_kinds.TOPIC, - original_channel_id=target_id, - source_channel_id=target_id, - ) - - # Create nodes mapping to channel - log.info(" Creating nodes...") - with transaction.atomic(): - create_nodes(cursor, target_id, root, download_url=download_url) - # TODO: Handle prerequisites - - # Delete the previous tree if it exists - old_previous = channel.previous_tree - if old_previous: - old_previous.parent = get_deleted_chefs_root() - old_previous.title = "Old previous tree for channel {}".format(channel.pk) - old_previous.save() - - # Save tree to target tree - channel.previous_tree = channel.main_tree - channel.main_tree = root - channel.save() - finally: - conn and conn.close() - tempf.close() - os.unlink(tempf.name) - - # Print stats - log.info( - "\n\nChannel has been imported (time: {ms})\n".format( - ms=datetime.datetime.now() - start + token_headers = { + "Authorization": f"Token {self.api_token}", + } + return lambda url, *args, **kwargs: target_method( + url, *args, headers=token_headers, **kwargs + ) + raise AttributeError( + f"'{self.__class__.__name__}' object has no attribute '{name}'" ) - ) - log.info("\n\n********** IMPORT COMPLETE **********\n\n") - -def create_channel(cursor, target_id, editor): - """create_channel: Create channel at target id - Args: - cursor (sqlite3.Connection): connection to export database - target_id (str): channel_id to write to - Returns: channel model created and id of root node - """ - id, name, description, thumbnail, root_pk, version, last_updated = cursor.execute( - "SELECT id, name, description, thumbnail, root_pk, version, last_updated FROM {table}".format( - table=CHANNEL_TABLE - ) - ).fetchone() - channel, is_new = models.Channel.objects.get_or_create( - pk=target_id, actor_id=editor.id - ) - 
channel.name = name - channel.description = description - channel.thumbnail = write_to_thumbnail_file(thumbnail) - channel.thumbnail_encoding = {"base64": thumbnail, "points": [], "zoom": 0} - channel.version = version - channel.save() - log.info("\tCreated channel {} with name {}".format(target_id, name)) - return channel, root_pk + def request(self, method, url, *args, **kwargs): + url = f"{self.base_url}{url}" + return super(ImportClient, self).request(method, url, *args, **kwargs) def write_to_thumbnail_file(raw_thumbnail): @@ -195,446 +109,617 @@ def write_to_thumbnail_file(raw_thumbnail): os.unlink(tempf.name) -def create_nodes(cursor, target_id, parent, indent=1, download_url=None): - """create_channel: Create channel at target id - Args: - cursor (sqlite3.Connection): connection to export database - target_id (str): channel_id to write to - parent (models.ContentNode): node's parent - indent (int): How far to indent print statements - Returns: newly created node +def convert_metadata_to_dict(metadata): """ - # Read database rows that match parent - parent_query = "parent_id='{}'".format(parent.node_id) - - sql_command = ( - "SELECT id, title, content_id, description, sort_order, " - "license_owner, author, license_id, kind, coach_content, lang_id FROM {table} WHERE {query} ORDER BY sort_order;".format( - table=NODE_TABLE, query=parent_query - ) - ) - query = cursor.execute(sql_command).fetchall() - - # Parse through rows and create models - for ( - id, - title, - content_id, - description, - sort_order, - license_owner, - author, - license_id, - kind, - coach_content, - lang_id, - ) in query: - log.info( - "{indent} {id} ({title} - {kind})...".format( - indent=" |" * indent, id=id, title=title, kind=kind - ) - ) - - # Determine role - role = roles.LEARNER - if coach_content: - role = roles.COACH + Convert metadata from a string to a dictionary. - # Determine extra_fields - assessment_query = "SELECT mastery_model, randomize FROM {table} WHERE contentnode_id='{node}'".format( - table=ASSESSMENTMETADATA_TABLE, node=id - ) - result = cursor.execute(assessment_query).fetchone() - extra_fields = result[0] if result else {} - if isinstance(extra_fields, str): - extra_fields = json.loads(extra_fields) - if result: - extra_fields.update({"randomize": result[1]}) - - # Determine license - license = retrieve_license(cursor, license_id) - license_description = license[1] if license else "" - license = license[0] if license else None - - # TODO: Determine thumbnail encoding - - # Create new node model - node = models.ContentNode.objects.create( - node_id=id, - original_source_node_id=id, - source_node_id=id, - title=title, - content_id=content_id, - description=description, - sort_order=sort_order, - copyright_holder=license_owner, - author=author, - license=license, - license_description=license_description, - language_id=lang_id, - role_visibility=role, - extra_fields=extra_fields, - kind_id=kind, - parent=parent, - original_channel_id=target_id, - source_channel_id=target_id, - ) - - # Handle foreign key references (children, files, tags) - if kind == content_kinds.TOPIC: - create_nodes( - cursor, target_id, node, indent=indent + 1, download_url=download_url - ) - elif kind == content_kinds.EXERCISE: - create_assessment_items( - cursor, node, indent=indent + 1, download_url=download_url - ) - create_files(cursor, node, indent=indent + 1, download_url=download_url) - create_tags(cursor, node, target_id, indent=indent + 1) + :param metadata: The metadata string to convert. 
+ :return: A dictionary representation of the metadata. + """ + if isinstance(metadata, str): + metadata_split = metadata.split(",") + return {metadata_key: True for metadata_key in metadata_split} + return metadata - return node +def convert_learning_activities_to_dict(content_kind, metadata): + """ + Convert learning activities from a string to a dictionary. -def retrieve_license(cursor, license_id): - """retrieve_license_name: Get license based on id from exported db - Args: - cursor (sqlite3.Connection): connection to export database - license_id (str): id of license on exported db - Returns: license model matching the name and the associated license description + :param content_kind: The content kind of the learning activities. + :param metadata: The learning activities string to convert. + :return: A dictionary representation of the learning activities. """ - # Handle no license being assigned - if license_id is None or license_id == "": + metadata = convert_metadata_to_dict(metadata) + if isinstance(metadata, dict): + return metadata + + if content_kind == content_kinds.EXERCISE: + return {learning_activities.PRACTICE: True} + elif content_kind in [content_kinds.HTML5, content_kinds.H5P]: + return {learning_activities.EXPLORE: True} + elif content_kind == content_kinds.AUDIO: + return {learning_activities.LISTEN: True} + elif content_kind == content_kinds.VIDEO: + return {learning_activities.WATCH: True} + elif content_kind == content_kinds.DOCUMENT: + return {learning_activities.READ: True} + elif content_kind == content_kinds.SLIDESHOW: + return {learning_activities.READ: True} + elif content_kind == content_kinds.TOPIC: return None + return {learning_activities.EXPLORE: True} - # Return license that matches name - name, description = cursor.execute( - "SELECT license_name, license_description FROM {table} WHERE id={id}".format( - table=LICENSE_TABLE, id=license_id - ) - ).fetchone() - return models.License.objects.get(license_name=name), description - - -def download_file( - filename, - download_url=None, - contentnode=None, - assessment_item=None, - preset=None, - file_size=None, - lang_id=None, -): - checksum, extension = os.path.splitext(filename) - extension = extension.lstrip(".") - filepath = models.generate_object_storage_name(checksum, filename) - - # Download file if it hasn't already been downloaded - if download_url and not default_storage.exists(filepath): - buffer = BytesIO() - response = requests.get( - "{}/content/storage/{}/{}/{}".format( - download_url, filename[0], filename[1], filename - ) - ) - for chunk in response: - buffer.write(chunk) - checksum, _, filepath = write_raw_content_to_storage( - buffer.getvalue(), ext=extension - ) - buffer.close() - - # Save values to new file object - file_obj = models.File( - file_format_id=extension, - file_size=file_size or default_storage.size(filepath), - contentnode=contentnode, - assessment_item=assessment_item, - language_id=lang_id, - preset_id=preset or "", - ) - file_obj.file_on_disk.name = filepath - file_obj.save() - - -def create_files(cursor, contentnode, indent=0, download_url=None): - """create_files: Get license - Args: - cursor (sqlite3.Connection): connection to export database - contentnode (models.ContentNode): node file references - indent (int): How far to indent print statements - Returns: None +class ImportManager(object): + """ + Import a channel from another Studio instance. 
This can be used to copy online Studio channels + into local machines for development, testing, faster editing, or other purposes. """ - # Parse database for files referencing content node and make file models - sql_command = ( - "SELECT checksum, extension, file_size, contentnode_id, " - "lang_id, preset FROM {table} WHERE contentnode_id='{id}';".format( - table=FILE_TABLE, id=contentnode.node_id - ) - ) - - query = cursor.execute(sql_command).fetchall() - for checksum, extension, file_size, contentnode_id, lang_id, preset in query: - filename = "{}.{}".format(checksum, extension) - log.info( - "{indent} * FILE {filename}...".format( - indent=" |" * indent, filename=filename - ) - ) - try: - download_file( - filename, - download_url=download_url, - contentnode=contentnode, - preset=preset, - file_size=file_size, - lang_id=lang_id, + def __init__( + self, + source_url, + source_id, + target_id=None, + editor=None, + public=False, + publish=False, + token=None, + download_content=True, + logger=None, + ): + self.source_id = source_id + self.target_id = target_id or source_id + self.source_url = source_url + self.editor = editor + self.public = public + self.publish = publish + self.token = token + self.download_content = download_content + self.logger = logger or logging.getLogger(__name__) + self.client = ImportClient(source_url, api_token=token) + self.conn = None + self.cursor = None + self.schema_version = None + + @cached_property + def editor_user(self): + """ + Get the User object for the editor email address. + + :return: The User object for the editor. + """ + return models.User.objects.get(email=self.editor) if self.editor else None + + def run(self): + """ + Run the import process. + """ + # Set up variables for the import process + self.logger.info("\n\n********** STARTING CHANNEL IMPORT **********") + start = datetime.datetime.now() + + if not self.token: + self.logger.warning( + "No API token provided. This may result in limited functionality." 
) - except IOError as e: - log.warning("\b FAILED (check logs for more details)") - sys.stderr.write( - "Restoration Process Error: Failed to save file object {}: {}".format( - filename, os.strerror(e.errno) - ) - ) - continue + # Test connection to the database + self.logger.info(f"Connecting to database for channel {self.source_id}...") + tempf = tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False) + try: + response = self.client.get(f"/content/databases/{self.source_id}.sqlite3") + for chunk in response: + tempf.write(chunk) -def create_tags(cursor, contentnode, target_id, indent=0): - """create_tags: Create tags associated with node - Args: - cursor (sqlite3.Connection): connection to export database - contentnode (models.ContentNode): node file references - target_id (str): channel_id to write to - indent (int): How far to indent print statements - Returns: None - """ - # Parse database for files referencing content node and make file models - sql_command = ( - "SELECT ct.id, ct.tag_name FROM {cnttable} cnt " - "JOIN {cttable} ct ON cnt.contenttag_id = ct.id " - "WHERE cnt.contentnode_id='{id}';".format( - cnttable=NODE_TAG_TABLE, - cttable=TAG_TABLE, - id=contentnode.node_id, - ) - ) - query = cursor.execute(sql_command).fetchall() - - # Build up list of tags - tag_list = [] - for id, tag_name in query: - log.info( - "{indent} ** TAG {tag}...".format(indent=" |" * indent, tag=tag_name) - ) - # Save values to new or existing tag object - tag_obj, is_new = models.ContentTag.objects.get_or_create( - pk=id, - tag_name=tag_name, - channel_id=target_id, - ) - tag_list.append(tag_obj) + tempf.close() - # Save tags to node - contentnode.tags.set(tag_list) - contentnode.save() + with using_content_database(tempf.name): + call_command( + "migrate", + "content", + database=get_active_content_database(), + no_input=True, + ) + self.conn = sqlite3.connect(tempf.name) + self.cursor = self.conn.cursor() + + # Start by creating the channel + self.logger.info("Creating channel...") + channel, root_pk = self._create_channel() + channel.editors.add(self.editor_user) + channel.save() + + # Create the root node + root = models.ContentNode.objects.create( + node_id=root_pk, + title=channel.name, + kind_id=content_kinds.TOPIC, + original_channel_id=self.target_id, + source_channel_id=self.target_id, + complete=True, + ) -def create_assessment_items(cursor, contentnode, indent=0, download_url=None): - """create_assessment_items: Generate assessment items based on perseus zip - Args: - cursor (sqlite3.Connection): connection to export database - contentnode (models.ContentNode): node assessment items reference - indent (int): How far to indent print statements - download_url (str): Domain to download files from - Returns: None - """ + # Create nodes mapping to channel + self.logger.info(" Creating nodes...") + with transaction.atomic(): + self._create_nodes(root) + # TODO: Handle prerequisites + + # Delete the previous tree if it exists + old_previous = channel.previous_tree + if old_previous: + old_previous.parent = get_deleted_chefs_root() + old_previous.title = f"Old previous tree for channel {channel.pk}" + old_previous.save() + + # Save the new tree to the target tree, and preserve the old one + channel.previous_tree = channel.main_tree + channel.main_tree = root + channel.save() + finally: + self.conn and self.conn.close() + tempf.close() + os.unlink(tempf.name) + + # Publish the channel if requested + if self.publish: + self.logger.info("Publishing channel...") + 
publish_channel(self.editor_user.id, channel.id) - # Parse database for files referencing content node and make file models - sql_command = ( - "SELECT checksum, extension " - "preset FROM {table} WHERE contentnode_id='{id}' AND preset='exercise';".format( - table=FILE_TABLE, id=contentnode.node_id + # Print stats + self.logger.info( + f"\n\nChannel has been imported (time: {datetime.datetime.now() - start})\n" ) - ) - - query = cursor.execute(sql_command).fetchall() - for checksum, extension in query: - filename = "{}.{}".format(checksum, extension) - log.info( - "{indent} * EXERCISE {filename}...".format( - indent=" |" * indent, filename=filename - ) + self.logger.info("\n\n********** IMPORT COMPLETE **********\n\n") + + def _create_channel(self): + """ + Create the channel at target id + """ + ( + id, + name, + description, + thumbnail, + root_pk, + version, + last_updated, + schema_version, + ) = self.cursor.execute( + f""" + SELECT + id, name, description, thumbnail, root_pk, version, last_updated, + min_schema_version + FROM {CHANNEL_TABLE} + """ + ).fetchone() + lang_id, _ = self.cursor.execute( + f""" + SELECT lang_id, COUNT(id) AS node_by_lang_count + FROM {NODE_TABLE} + ORDER BY node_by_lang_count DESC + """ + ).fetchone() + channel, is_new = models.Channel.objects.get_or_create( + pk=self.target_id, actor_id=self.editor_user.id ) - - try: - # Store the downloaded zip into temporary storage - tempf = tempfile.NamedTemporaryFile( - suffix=".{}".format(extension), delete=False - ) - response = requests.get( - "{}/content/storage/{}/{}/{}".format( - download_url, filename[0], filename[1], filename + channel.name = name + channel.description = description + channel.language_id = lang_id + channel.thumbnail = write_to_thumbnail_file(thumbnail) + channel.thumbnail_encoding = {"base64": thumbnail, "points": [], "zoom": 0} + channel.version = version + channel.public = self.public + channel.save() + self.logger.info(f"\tCreated channel {self.target_id} with name {name}") + return channel, root_pk + + def _create_nodes(self, parent, indent=1): + """ + Create node(s) for a channel with target id + + :param parent: node's parent + :param indent: How far to indent print statements + """ + sql_command = f""" + SELECT + id, title, content_id, description, sort_order, license_owner, author, license_id, + kind, coach_content, lang_id, grade_levels, resource_types, learning_activities, + accessibility_labels, categories, learner_needs, duration, options + FROM {NODE_TABLE} + WHERE parent_id = ? 
+ ORDER BY sort_order; + """ + query = self.cursor.execute( + sql_command, (getattr(parent, "node_id", parent),) + ).fetchall() + + # Parse through rows and create models + for ( + id, + title, + content_id, + description, + sort_order, + license_owner, + author, + license_id, + kind, + coach_content, + lang_id, + grade_levels, + resource_types, + learning_activities_, + accessibility_labels, + categories, + learner_needs, + duration, + options, + ) in query: + self.logger.info( + "{indent} {id} ({title} - {kind})...".format( + indent=" |" * indent, id=id, title=title, kind=kind ) ) - for chunk in response: - tempf.write(chunk) - tempf.close() - extract_assessment_items(tempf.name, contentnode, download_url=download_url) - except IOError as e: - log.warning("\b FAILED (check logs for more details)") - sys.stderr.write( - "Restoration Process Error: Failed to save file object {}: {}".format( - filename, os.strerror(e.errno) + + # Determine role + role = roles.LEARNER + if coach_content: + role = roles.COACH + + # Determine extra_fields + extra_fields = {} + if kind == content_kinds.EXERCISE: + randomize_sql = f""" + SELECT randomize + FROM {ASSESSMENTMETADATA_TABLE} + WHERE contentnode_id = ? + """ + randomize = self.cursor.execute(randomize_sql, (id,)).fetchone() + extra_fields["options"] = json.loads(options) if options else {} + extra_fields["randomize"] = bool(randomize[0]) if randomize else False + completion_criteria_ = extra_fields["options"].get( + "completion_criteria" ) + if ( + completion_criteria_ + and completion_criteria_.get("model") == completion_criteria.MASTERY + ): + mastery_model = completion_criteria_.get("threshold", {}).get( + "mastery_model" + ) + if mastery_model == mastery_criteria.DO_ALL: + completion_criteria_["threshold"] = { + "mastery_model": mastery_model, + } + if ( + completion_criteria_ + and "learner_managed" not in completion_criteria_ + ): + completion_criteria_["learner_managed"] = False + + # Determine license + license_result = self._retrieve_license(license_id) + license_description = license_result[1] if license_result else "" + license_result = license_result[0] if license_result else None + + # TODO: Determine thumbnail encoding + + # Create the new node model + node = models.ContentNode.objects.create( + node_id=id, + original_source_node_id=id, + source_node_id=id, + title=title, + content_id=content_id, + description=description, + sort_order=sort_order, + copyright_holder=license_owner, + author=author, + license=license_result, + license_description=license_description, + language_id=lang_id, + role_visibility=role, + extra_fields=extra_fields, + kind_id=kind, + parent=parent, + original_channel_id=self.target_id, + source_channel_id=self.target_id, + grade_levels=convert_metadata_to_dict(grade_levels), + resource_types=convert_metadata_to_dict(resource_types), + learning_activities=convert_learning_activities_to_dict( + kind, learning_activities_ + ), + accessibility_labels=convert_metadata_to_dict(accessibility_labels), + categories=convert_metadata_to_dict(categories), + learner_needs=convert_metadata_to_dict(learner_needs), ) - continue - finally: - os.unlink(tempf.name) - -def extract_assessment_items(filepath, contentnode, download_url=None): - """extract_assessment_items: Create and save assessment items to content node - Args: - filepath (str): Where perseus zip is stored - contentnode (models.ContentNode): node assessment items reference - download_url (str): Domain to download files from - Returns: None - """ + # Handle foreign key 
references (children, files, tags) + if kind == content_kinds.TOPIC: + self._create_nodes(node, indent=indent + 1) + elif kind == content_kinds.EXERCISE: + self._create_assessment_items(node, indent=indent + 1) + self._create_files(node, indent=indent + 1) + self._create_tags(node, indent=indent + 1) + + errors = node.mark_complete() + if errors: + self.logger.warning(f"Node {node.node_id} has errors: {errors}") + node.save() + + def _retrieve_license(self, license_id): + """ + Get license based on id from exported db + + :param license_id: id of license on exported db + :return: license model matching the id and the associated license description + :rtype: tuple + """ + # Handle no license being assigned + if license_id is None or license_id == "": + return None + + # Return license that matches name + name, description = self.cursor.execute( + f""" + SELECT license_name, license_description + FROM {LICENSE_TABLE} + WHERE id = ? + """, + (license_id,), + ).fetchone() + return models.License.objects.get(license_name=name), description + + def _create_files(self, contentnode, indent=0): + """ + Create and possibly download node files + + :param contentnode: node file references + :param indent: How far to indent print statements + """ + # Parse database for files referencing content node and make file models + sql_command = f""" + SELECT checksum, extension, file_size, contentnode_id, lang_id, preset, thumbnail + FROM {FILE_TABLE} + WHERE contentnode_id = ?; + """ + query = self.cursor.execute(sql_command, (contentnode.node_id,)).fetchall() + + for ( + checksum, + extension, + file_size, + contentnode_id, + lang_id, + preset, + is_thumbnail, + ) in query: + filename = "{}.{}".format(checksum, extension) + self.logger.info( + "{indent} * FILE {filename}...".format( + indent=" |" * indent, filename=filename + ) + ) - try: - tempdir = tempfile.mkdtemp() - with zipfile.ZipFile(filepath, "r") as zipf: - zipf.extractall(tempdir) - os.chdir(tempdir) - - with open("exercise.json", "rb") as fobj: - data = json.load(fobj) - - for index, assessment_id in enumerate(data["all_assessment_items"]): - with open("{}.json".format(assessment_id), "rb") as fobj: - assessment_item = generate_assessment_item( - assessment_id, - index, - data["assessment_mapping"][assessment_id], - json.load(fobj), - download_url=download_url, + try: + self._download_file( + filename, + contentnode=contentnode, + preset=preset, + file_size=file_size, + lang_id=lang_id, + is_thumbnail=is_thumbnail, ) - contentnode.assessment_items.add(assessment_item) - finally: - shutil.rmtree(tempdir) + except IOError as e: + self.logger.warning("\b FAILED (check logs for more details)") + if e.errno: + sys.stderr.write( + f"Restoration Process Error: Failed to save file object {filename}: {os.strerror(e.errno)}" + ) + continue + + def _download_file( + self, + filename, + contentnode=None, + assessment_item=None, + preset=None, + file_size=None, + lang_id=None, + is_thumbnail=False, + ): + """ + Create and possibly download a file from source instance and save to local storage + + :param filename: the name of the file to download + :param contentnode: the associated content node + :param assessment_item: the associated assessment item + :param preset: the format preset for the file + :param file_size: the known size of the file + :param lang_id: the language ID of the file + :param is_thumbnail: whether the file is a thumbnail + """ + checksum, extension = os.path.splitext(filename) + extension = extension.lstrip(".") + filepath = 
models.generate_object_storage_name(checksum, filename) + + file_url = f"/content/storage/{filename[0]}/{filename[1]}/{filename}" + file_exists = False + + # If the file already exists, get the size from the storage + if default_storage.exists(filepath): + file_size = file_size or default_storage.size(filepath) + file_exists = True + # if it needs downloading and if we were instructed to do so + elif self.download_content or (is_thumbnail and contentnode): + buffer = BytesIO() + response = self.client.get(file_url) + for chunk in response: + buffer.write(chunk) + if is_thumbnail and contentnode: + # If the file is a thumbnail, save it to the content node + contentnode.thumbnail_encoding = json.dumps( + { + "base64": get_thumbnail_encoding(filename, input_buffer=buffer), + "points": [], + "zoom": 0, + } + ) + else: + checksum = hashlib.md5() + checksum.update(buffer.getvalue()) + hashed_filename = checksum.hexdigest() + full_filename = "{}.{}".format(hashed_filename, extension.lower()) + filepath = models.generate_object_storage_name( + hashed_filename, full_filename + ) -def generate_assessment_item( - assessment_id, order, assessment_type, assessment_data, download_url=None -): - """generate_assessment_item: Generates a new assessment item - Args: - assessment_id (str): AssessmentItem.assessment_id value - order (Number): AssessmentItem.order value - assessment_type (str): AssessmentItem.type value - assessment_data (dict): Extracted data from perseus file - download_url (str): Domain to download files from - Returns: models.AssessmentItem - """ - assessment_item = models.AssessmentItem.objects.create( - assessment_id=assessment_id, type=assessment_type, order=order - ) - if assessment_type == exercises.PERSEUS_QUESTION: - assessment_item.raw_data = json.dumps(assessment_data) - else: - # Parse questions - assessment_data["question"]["content"] = "\n\n".join( - assessment_data["question"]["content"].split("\n\n")[:-1] - ) - assessment_item.question = process_content( - assessment_data["question"], assessment_item, download_url=download_url + self.storage.save(filepath, buffer) + buffer.close() + file_exists = True + # otherwise, if file size is not known, get it from the response headers + elif not file_size: + response = self.client.head(file_url) + file_size = int(response.headers.get("Content-Length", 0)) + + # Save values to a new file object + file_obj = models.File( + file_format_id=extension, + file_size=file_size, + contentnode=contentnode, + assessment_item=assessment_item, + language_id=lang_id, + preset_id=preset or "", + checksum=checksum, ) - - # Parse answers - answer_data = assessment_data["question"]["widgets"][ - ANSWER_FIELD_MAP[assessment_type] - ]["options"] - if assessment_type == exercises.INPUT_QUESTION: - assessment_item.answers = json.dumps( - [ - {"answer": answer["value"], "correct": True} - for answer in answer_data["answers"] - ] + file_obj.file_on_disk.name = filepath + # set_by_file_on_disk: skip unless the file has been downloaded + file_obj.save(set_by_file_on_disk=file_exists) + + def _create_tags(self, contentnode, indent=0): + """ + Create tags associated with node + + :param contentnode: node tags reference + :param indent: How far to indent print statements + """ + # Parse database for files referencing content node and make file models + sql_command = f""" + SELECT ct.id, ct.tag_name + FROM {NODE_TAG_TABLE} cnt + JOIN {TAG_TABLE} ct ON cnt.contenttag_id = ct.id + WHERE cnt.contentnode_id = ?; + """ + query = self.cursor.execute(sql_command, 
(contentnode.node_id,)).fetchall() + + # Build up list of tags + tag_list = [] + for id, tag_name in query: + self.logger.info( + "{indent} ** TAG {tag}...".format(indent=" |" * indent, tag=tag_name) ) - else: - assessment_item.answers = json.dumps( - [ - { - "answer": process_content( - answer, assessment_item, download_url=download_url - ), - "correct": answer["correct"], - } - for answer in answer_data["choices"] - ] + # Save values to new or existing tag object + tag_obj, is_new = models.ContentTag.objects.get_or_create( + pk=id, + tag_name=tag_name, + channel_id=self.target_id, ) - assessment_item.randomize = answer_data["randomize"] - - # Parse hints - assessment_item.hints = json.dumps( - [ - { - "hint": process_content( - hint, assessment_item, download_url=download_url - ) - } - for hint in assessment_data["hints"] - ] - ) - - assessment_item.save() - return assessment_item - + tag_list.append(tag_obj) + + # Save tags to node + contentnode.tags.set(tag_list) + contentnode.save() + + def _create_assessment_items(self, contentnode, indent=0): + """ + Generate assessment items based on perseus zip + + :param contentnode: node assessment items reference + :param indent: How far to indent print statements + """ + if not self.token: + self.logger.warning( + f"Skipping assessment items for node {contentnode.node_id}" + ) + return -def process_content(data, assessment_item, download_url=None): - """process_content: Parses perseus text for special formatting (e.g. formulas, images) - Args: - data (dict): Perseus data to parse (e.g. parsing 'question' field) - download_url (str): Domain to download files from - assessment_item (models.AssessmentItem): assessment item to save images to - Returns: models.AssessmentItem - """ - data["content"] = data["content"].replace( - " ", "" - ) # Remove unrecognized non unicode characters - # Process formulas - for match in re.finditer(r"(\$[^\$☣]+\$)", data["content"]): - data["content"] = data["content"].replace( - match.group(0), "${}$".format(match.group(0)) + # first obtain the content node's Studio ID with the node ID + node_response = self.client.get_with_token( + f"/api/contentnode?_node_id_channel_id___in={contentnode.node_id},{self.source_id}" ) + if node_response.status_code != 200: + self.logger.warning( + f"Failed to obtain assessment items for node {contentnode.node_id}" + ) + return - # Process images + node_data = node_response.json() + contentnode_id = node_data[0]["id"] if node_data else None + if not contentnode_id: + self.logger.warning(f"No content node found for node {contentnode.node_id}") + return - for match in re.finditer( - r"!\[[^\]]*\]\((\$(\{☣ LOCALPATH\}\/images)\/([^\.]+\.[^\)]+))\)", - data["content"], - ): - data["content"] = data["content"].replace( - match.group(2), exercises.CONTENT_STORAGE_PLACEHOLDER + # Get the content node's assessment items + assessment_response = self.client.get_with_token( + f"/api/assessmentitem?contentnode__in={contentnode_id}" ) - image_data = data["images"].get(match.group(1)) - if image_data and image_data.get("width"): - data["content"] = data["content"].replace( - match.group(3), - "{} ={}x{}".format( - match.group(3), image_data["width"], image_data["height"] - ), + if assessment_response.status_code != 200: + self.logger.warning( + f"Failed to obtain assessment items for node {contentnode.node_id}" ) + return - # Save files to db - download_file( - match.group(3), - assessment_item=assessment_item, - preset=format_presets.EXERCISE, - download_url=download_url, - ) + assessment_items = 
assessment_response.json() + if not assessment_items: + self.logger.warning( + f"No assessment items found for node {contentnode.node_id}" + ) + return - return data["content"] + # Create the assessment items + for item in assessment_items: + self.logger.info( + "{indent} ** ASSESSMENT ITEM {assessment_id}...".format( + indent=" |" * indent, assessment_id=item["assessment_id"] + ) + ) + assessment_item = models.AssessmentItem.objects.create( + assessment_id=item["assessment_id"], + type=item["type"], + order=item["order"], + question=item["question"], + answers=item["answers"], + hints=item["hints"], + randomize=item.get("randomize", False), + ) + contentnode.assessment_items.add(assessment_item) + contentnode.save() + + def _process_assessment_images(self, assessment_item): + """ + Process images in assessment items and save them to the database. + + :param assessment_item: The assessment item to process. + """ + if not self.download_content: + # Skip if not downloading content + return + + for content in [ + assessment_item.question, + assessment_item.answers, + assessment_item.hints, + ]: + for match in re.finditer(exercise_image_filename_regex, content): + # Save files to db + self._download_file( + match.group(3), + assessment_item=assessment_item, + preset=format_presets.EXERCISE, + ) From ecc9c34be63b8be944facd36d89f4c961aa91905 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Tue, 6 May 2025 15:11:49 -0700 Subject: [PATCH 2/7] Improve logging and progress output, perform bulk handling of assessments --- .../management/commands/restore_channel.py | 1 - .../contentcuration/utils/import_tools.py | 207 +++++++++++------- requirements-dev.in | 1 + requirements-dev.txt | 6 + 4 files changed, 131 insertions(+), 84 deletions(-) diff --git a/contentcuration/contentcuration/management/commands/restore_channel.py b/contentcuration/contentcuration/management/commands/restore_channel.py index 49b785f725..16b3976228 100644 --- a/contentcuration/contentcuration/management/commands/restore_channel.py +++ b/contentcuration/contentcuration/management/commands/restore_channel.py @@ -53,7 +53,6 @@ def add_arguments(self, parser): ) def handle(self, *args, **options): - logger.info("\n\n********** STARTING CHANNEL RESTORATION **********") manager = ImportManager( options["source_url"], options["source_id"], diff --git a/contentcuration/contentcuration/utils/import_tools.py b/contentcuration/contentcuration/utils/import_tools.py index 8f094ed7eb..c0d75207c5 100644 --- a/contentcuration/contentcuration/utils/import_tools.py +++ b/contentcuration/contentcuration/utils/import_tools.py @@ -12,6 +12,7 @@ from io import BytesIO import requests +import tqdm from django.core.files.storage import default_storage from django.core.management import call_command from django.db import transaction @@ -181,7 +182,7 @@ def __init__( self.client = ImportClient(source_url, api_token=token) self.conn = None self.cursor = None - self.schema_version = None + self.progress = None @cached_property def editor_user(self): @@ -194,10 +195,10 @@ def editor_user(self): def run(self): """ - Run the import process. + Run the import restoration process. 
""" + self.logger.info("********** STARTING CHANNEL RESTORATION **********") # Set up variables for the import process - self.logger.info("\n\n********** STARTING CHANNEL IMPORT **********") start = datetime.datetime.now() if not self.token: @@ -243,10 +244,38 @@ def run(self): complete=True, ) + self.logger.info("Creating nodes...") + total_nodes = self.cursor.execute( + f"SELECT COUNT(*) FROM {NODE_TABLE}" + ).fetchone()[0] + node_progress = tqdm.tqdm( + total=total_nodes, desc="Restoring nodes", unit="node" + ) + # Create nodes mapping to channel - self.logger.info(" Creating nodes...") with transaction.atomic(): - self._create_nodes(root) + self._create_nodes(root, node_progress) + node_progress.close() + self.logger.info("Creating assessment items...") + exercise_nodes = models.ContentNode.objects.filter( + kind_id=content_kinds.EXERCISE, tree_id=root.tree_id + ) + exercise_progress = tqdm.tqdm( + total=exercise_nodes.count(), + desc="Restoring assessments", + unit="node", + ) + chunk = [] + for node in exercise_nodes.iterator(chunk_size=10): + chunk.append(node) + if len(chunk) >= 10: + self._create_assessment_items(chunk) + exercise_progress.update(len(chunk)) + chunk = [] + if chunk: + self._create_assessment_items(chunk) + exercise_progress.update(len(chunk)) + exercise_progress.close() # TODO: Handle prerequisites # Delete the previous tree if it exists @@ -272,9 +301,9 @@ def run(self): # Print stats self.logger.info( - f"\n\nChannel has been imported (time: {datetime.datetime.now() - start})\n" + f"Channel has been imported (time: {datetime.datetime.now() - start})" ) - self.logger.info("\n\n********** IMPORT COMPLETE **********\n\n") + self.logger.info("********** IMPORT COMPLETE **********") def _create_channel(self): """ @@ -315,15 +344,15 @@ def _create_channel(self): channel.version = version channel.public = self.public channel.save() - self.logger.info(f"\tCreated channel {self.target_id} with name {name}") + self.logger.info(f"Created channel {self.target_id} with name {name}") return channel, root_pk - def _create_nodes(self, parent, indent=1): + def _create_nodes(self, parent, progress): """ Create node(s) for a channel with target id :param parent: node's parent - :param indent: How far to indent print statements + :param progress: progress bar for node creation """ sql_command = f""" SELECT @@ -360,12 +389,6 @@ def _create_nodes(self, parent, indent=1): duration, options, ) in query: - self.logger.info( - "{indent} {id} ({title} - {kind})...".format( - indent=" |" * indent, id=id, title=title, kind=kind - ) - ) - # Determine role role = roles.LEARNER if coach_content: @@ -441,16 +464,16 @@ def _create_nodes(self, parent, indent=1): # Handle foreign key references (children, files, tags) if kind == content_kinds.TOPIC: - self._create_nodes(node, indent=indent + 1) - elif kind == content_kinds.EXERCISE: - self._create_assessment_items(node, indent=indent + 1) - self._create_files(node, indent=indent + 1) - self._create_tags(node, indent=indent + 1) - - errors = node.mark_complete() - if errors: - self.logger.warning(f"Node {node.node_id} has errors: {errors}") + self._create_nodes(node, progress) + self._create_files(node) + self._create_tags(node) + + if kind != content_kinds.EXERCISE: + errors = node.mark_complete() + if errors: + self.logger.warning(f"Node {node.node_id} has errors: {errors}") node.save() + progress.update(1) def _retrieve_license(self, license_id): """ @@ -475,12 +498,11 @@ def _retrieve_license(self, license_id): ).fetchone() return 
models.License.objects.get(license_name=name), description - def _create_files(self, contentnode, indent=0): + def _create_files(self, contentnode): """ Create and possibly download node files :param contentnode: node file references - :param indent: How far to indent print statements """ # Parse database for files referencing content node and make file models sql_command = f""" @@ -500,11 +522,6 @@ def _create_files(self, contentnode, indent=0): is_thumbnail, ) in query: filename = "{}.{}".format(checksum, extension) - self.logger.info( - "{indent} * FILE {filename}...".format( - indent=" |" * indent, filename=filename - ) - ) try: self._download_file( @@ -602,104 +619,128 @@ def _download_file( # set_by_file_on_disk: skip unless the file has been downloaded file_obj.save(set_by_file_on_disk=file_exists) - def _create_tags(self, contentnode, indent=0): + def _create_tags(self, contentnode): """ Create tags associated with node :param contentnode: node tags reference - :param indent: How far to indent print statements """ # Parse database for files referencing content node and make file models sql_command = f""" - SELECT ct.id, ct.tag_name + SELECT ct.tag_name FROM {NODE_TAG_TABLE} cnt JOIN {TAG_TABLE} ct ON cnt.contenttag_id = ct.id WHERE cnt.contentnode_id = ?; """ query = self.cursor.execute(sql_command, (contentnode.node_id,)).fetchall() - # Build up list of tags - tag_list = [] - for id, tag_name in query: - self.logger.info( - "{indent} ** TAG {tag}...".format(indent=" |" * indent, tag=tag_name) - ) - # Save values to new or existing tag object - tag_obj, is_new = models.ContentTag.objects.get_or_create( - pk=id, - tag_name=tag_name, - channel_id=self.target_id, - ) - tag_list.append(tag_obj) + models.ContentTag.objects.bulk_create( + [ + models.ContentTag( + tag_name=tag_name, + channel_id=self.target_id, + ) + for tag_name in query + ], + ignore_conflicts=True, + ) # Save tags to node - contentnode.tags.set(tag_list) + contentnode.tags.set( + models.ContentTag.objects.filter( + tag_name__in=query, channel_id=self.target_id + ) + ) contentnode.save() - def _create_assessment_items(self, contentnode, indent=0): + def _create_assessment_items(self, nodes): """ - Generate assessment items based on perseus zip + Generate assessment items based on API data - :param contentnode: node assessment items reference - :param indent: How far to indent print statements + :param nodes: nodes to lookup assessment items """ + # Note: there are several different IDs being used within this method + node_ids = [node.node_id for node in nodes] + if not self.token: self.logger.warning( - f"Skipping assessment items for node {contentnode.node_id}" + f"Skipping assessment items for node(s) {','. join(node_ids)}" ) return - # first obtain the content node's Studio ID with the node ID - node_response = self.client.get_with_token( - f"/api/contentnode?_node_id_channel_id___in={contentnode.node_id},{self.source_id}" + # first obtain the remote nodes' IDs with the node ID and channel ID + node_channel_ids = f",{self.source_id},".join(node_ids) + nodes_response = self.client.get_with_token( + f"/api/contentnode?_node_id_channel_id___in={node_channel_ids},{self.source_id}" ) - if node_response.status_code != 200: + if nodes_response.status_code != 200: self.logger.warning( - f"Failed to obtain assessment items for node {contentnode.node_id}" + f"Failed to obtain assessment items for node(s) {','. 
join(node_ids)}" ) return - node_data = node_response.json() - contentnode_id = node_data[0]["id"] if node_data else None - if not contentnode_id: - self.logger.warning(f"No content node found for node {contentnode.node_id}") + nodes_data = nodes_response.json() + remote_node_pks = [n["id"] for n in nodes_data] if nodes_data else None + + if not remote_node_pks: + self.logger.warning( + f"No content node found for node(s) {','. join(node_ids)}" + ) return # Get the content node's assessment items assessment_response = self.client.get_with_token( - f"/api/assessmentitem?contentnode__in={contentnode_id}" + f"/api/assessmentitem?contentnode__in={','.join(remote_node_pks)}" ) if assessment_response.status_code != 200: self.logger.warning( - f"Failed to obtain assessment items for node {contentnode.node_id}" + f"Failed to obtain assessment items for node(s) {','. join(node_ids)}" ) return assessment_items = assessment_response.json() if not assessment_items: self.logger.warning( - f"No assessment items found for node {contentnode.node_id}" + f"No assessment items found for node(s) {','. join(node_ids)}" ) return - # Create the assessment items - for item in assessment_items: - self.logger.info( - "{indent} ** ASSESSMENT ITEM {assessment_id}...".format( - indent=" |" * indent, assessment_id=item["assessment_id"] + remote_node_pk_map = ( + {n["node_id"]: n["id"] for n in nodes_data} if nodes_data else {} + ) + + for local_node in nodes: + remote_contentnode_id = remote_node_pk_map.get(local_node.node_id) + reduced_assessment_items = [ + item + for item in assessment_items + if item["contentnode"] == remote_contentnode_id + ] + + if not reduced_assessment_items: + self.logger.warning( + f"No assessment items found for node {local_node.node_id}" ) - ) - assessment_item = models.AssessmentItem.objects.create( - assessment_id=item["assessment_id"], - type=item["type"], - order=item["order"], - question=item["question"], - answers=item["answers"], - hints=item["hints"], - randomize=item.get("randomize", False), - ) - contentnode.assessment_items.add(assessment_item) - contentnode.save() + continue + + for item in reduced_assessment_items: + assessment_item = models.AssessmentItem.objects.create( + assessment_id=item["assessment_id"], + type=item["type"], + order=item["order"], + question=item["question"], + answers=item["answers"], + hints=item["hints"], + raw_data=item["raw_data"], + source_url=item["source_url"], + randomize=item.get("randomize", False), + ) + self._process_assessment_images(assessment_item) + local_node.assessment_items.add(assessment_item) + errors = local_node.mark_complete() + if errors: + self.logger.warning(f"Node {local_node.node_id} has errors: {errors}") + local_node.save() def _process_assessment_images(self, assessment_item): """ diff --git a/requirements-dev.in b/requirements-dev.in index bd1d8385e8..02c2458af5 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -9,3 +9,4 @@ pre-commit==4.5.0 nodeenv pip-tools==7.5.2 drf-yasg==1.21.10 +tqdm diff --git a/requirements-dev.txt b/requirements-dev.txt index b1442ddbc2..50528b8eb2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -113,6 +113,8 @@ tomli==1.2.3 # build # pip-tools # pytest +tqdm==4.67.1 + # via -r requirements-dev.in typing-extensions==4.15.0 # via # -c requirements.txt @@ -123,3 +125,7 @@ virtualenv==20.26.6 # via pre-commit wheel==0.38.1 # via pip-tools + +# The following packages are considered to be unsafe in a requirements file: +# pip +# setuptools From 
09bb546eb9eea5cd0841b6d43f3f7bd70b0a78c7 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Tue, 6 May 2025 15:43:31 -0700 Subject: [PATCH 3/7] Local composite storage handling allowing read-only access to cloud bucket --- .../commands/set_content_mimetypes.py | 2 +- .../contentcuration/production_settings.py | 2 +- .../contentcuration/sandbox_settings.py | 2 +- contentcuration/contentcuration/settings.py | 2 +- .../tests/utils/test_cloud_storage.py | 10 --- .../tests/{ => utils}/test_gcs_storage.py | 14 ++-- .../test_storage.py} | 12 ++- .../contentcuration/utils/cloud_storage.py | 39 ---------- .../contentcuration/utils/storage/__init__.py | 0 .../contentcuration/utils/storage/base.py | 76 +++++++++++++++++++ .../{storage_common.py => storage/common.py} | 17 +++-- .../contentcuration/utils/storage/dev.py | 23 ++++++ .../utils/{gcs_storage.py => storage/gcs.py} | 66 ++-------------- .../contentcuration/viewsets/file.py | 2 +- 14 files changed, 132 insertions(+), 135 deletions(-) delete mode 100644 contentcuration/contentcuration/tests/utils/test_cloud_storage.py rename contentcuration/contentcuration/tests/{ => utils}/test_gcs_storage.py (95%) rename contentcuration/contentcuration/tests/{test_storage_common.py => utils/test_storage.py} (95%) delete mode 100644 contentcuration/contentcuration/utils/cloud_storage.py create mode 100644 contentcuration/contentcuration/utils/storage/__init__.py create mode 100644 contentcuration/contentcuration/utils/storage/base.py rename contentcuration/contentcuration/utils/{storage_common.py => storage/common.py} (94%) create mode 100644 contentcuration/contentcuration/utils/storage/dev.py rename contentcuration/contentcuration/utils/{gcs_storage.py => storage/gcs.py} (80%) diff --git a/contentcuration/contentcuration/management/commands/set_content_mimetypes.py b/contentcuration/contentcuration/management/commands/set_content_mimetypes.py index 27af4732fc..8a79fd02f5 100755 --- a/contentcuration/contentcuration/management/commands/set_content_mimetypes.py +++ b/contentcuration/contentcuration/management/commands/set_content_mimetypes.py @@ -14,7 +14,7 @@ from django.core.files.storage import default_storage from django.core.management.base import BaseCommand -from contentcuration.utils.storage_common import determine_content_type +from contentcuration.utils.storage.common import determine_content_type class Command(BaseCommand): diff --git a/contentcuration/contentcuration/production_settings.py b/contentcuration/contentcuration/production_settings.py index a00bf43a41..82319bd85e 100644 --- a/contentcuration/contentcuration/production_settings.py +++ b/contentcuration/contentcuration/production_settings.py @@ -10,7 +10,7 @@ MEDIA_ROOT = base_settings.STORAGE_ROOT -DEFAULT_FILE_STORAGE = "contentcuration.utils.gcs_storage.CompositeGCS" +DEFAULT_FILE_STORAGE = "contentcuration.utils.storage.gcs.CompositeGCS" SESSION_ENGINE = "django.contrib.sessions.backends.db" # email settings diff --git a/contentcuration/contentcuration/sandbox_settings.py b/contentcuration/contentcuration/sandbox_settings.py index 61e00a465f..912fed7244 100644 --- a/contentcuration/contentcuration/sandbox_settings.py +++ b/contentcuration/contentcuration/sandbox_settings.py @@ -3,7 +3,7 @@ DEBUG = True -DEFAULT_FILE_STORAGE = "contentcuration.utils.gcs_storage.CompositeGCS" +DEFAULT_FILE_STORAGE = "contentcuration.utils.storage.gcs.CompositeGCS" LANGUAGES += (("ar", gettext("Arabic")),) # noqa diff --git a/contentcuration/contentcuration/settings.py 
b/contentcuration/contentcuration/settings.py index 0f18ed0131..e57064601d 100644 --- a/contentcuration/contentcuration/settings.py +++ b/contentcuration/contentcuration/settings.py @@ -357,7 +357,7 @@ def gettext(s): ORPHAN_DATE_CLEAN_UP_THRESHOLD = TWO_WEEKS_AGO # CLOUD STORAGE SETTINGS -DEFAULT_FILE_STORAGE = "django_s3_storage.storage.S3Storage" +DEFAULT_FILE_STORAGE = "contentcuration.utils.storage.dev.CompositeStorage" AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") or "development" AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") or "development" AWS_S3_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME") or "content" diff --git a/contentcuration/contentcuration/tests/utils/test_cloud_storage.py b/contentcuration/contentcuration/tests/utils/test_cloud_storage.py deleted file mode 100644 index 5d84fd9f10..0000000000 --- a/contentcuration/contentcuration/tests/utils/test_cloud_storage.py +++ /dev/null @@ -1,10 +0,0 @@ -from django.test import TestCase - -from contentcuration.utils.cloud_storage import CloudStorage - - -class CloudStorageTestCase(TestCase): - def test_backend_initialization(self): - cloud_storage_instance = CloudStorage() - self.assertIsNotNone(cloud_storage_instance) - self.assertIsInstance(cloud_storage_instance, CloudStorage) diff --git a/contentcuration/contentcuration/tests/test_gcs_storage.py b/contentcuration/contentcuration/tests/utils/test_gcs_storage.py similarity index 95% rename from contentcuration/contentcuration/tests/test_gcs_storage.py rename to contentcuration/contentcuration/tests/utils/test_gcs_storage.py index a58420873e..9036641774 100755 --- a/contentcuration/contentcuration/tests/test_gcs_storage.py +++ b/contentcuration/contentcuration/tests/utils/test_gcs_storage.py @@ -8,8 +8,8 @@ from google.cloud.storage.blob import Blob from mixer.main import mixer -from contentcuration.utils.gcs_storage import CompositeGCS -from contentcuration.utils.gcs_storage import GoogleCloudStorage +from contentcuration.utils.storage.gcs import CompositeGCS +from contentcuration.utils.storage.gcs import GoogleCloudStorage class GoogleCloudStorageSaveTestCase(TestCase): @@ -74,9 +74,9 @@ def test_uploads_cache_control_private_if_content_database(self): self.storage.save(filename, self.content, blob_object=self.blob_obj) assert "private" in self.blob_obj.cache_control - @mock.patch("contentcuration.utils.gcs_storage.BytesIO") + @mock.patch("contentcuration.utils.storage.gcs.BytesIO") @mock.patch( - "contentcuration.utils.gcs_storage.GoogleCloudStorage._is_file_empty", + "contentcuration.utils.storage.gcs.GoogleCloudStorage._is_file_empty", return_value=False, ) def test_gzip_if_content_database(self, bytesio_mock, file_empty_mock): @@ -158,10 +158,10 @@ def setUp(self): self.mock_anon_client.get_bucket.return_value = self.mock_anon_bucket with mock.patch( - "contentcuration.utils.gcs_storage._create_default_client", + "contentcuration.utils.storage.gcs._create_default_client", return_value=self.mock_default_client, ), mock.patch( - "contentcuration.utils.gcs_storage.Client.create_anonymous_client", + "contentcuration.utils.storage.gcs.Client.create_anonymous_client", return_value=self.mock_anon_client, ): self.storage = CompositeGCS() @@ -192,7 +192,7 @@ def test_open(self): self.assertIsInstance(f, File) self.mock_default_bucket.get_blob.assert_called_with("blob") - @mock.patch("contentcuration.utils.gcs_storage.Blob") + @mock.patch("contentcuration.utils.storage.gcs.Blob") def test_save(self, mock_blob): self.storage.save("blob", BytesIO(b"content")) blob = 
mock_blob.return_value diff --git a/contentcuration/contentcuration/tests/test_storage_common.py b/contentcuration/contentcuration/tests/utils/test_storage.py similarity index 95% rename from contentcuration/contentcuration/tests/test_storage_common.py rename to contentcuration/contentcuration/tests/utils/test_storage.py index f89534c194..84cb774646 100644 --- a/contentcuration/contentcuration/tests/test_storage_common.py +++ b/contentcuration/contentcuration/tests/utils/test_storage.py @@ -10,14 +10,12 @@ from django_s3_storage.storage import S3Storage from mock import MagicMock -from .base import StudioTestCase +from ..base import StudioTestCase from contentcuration.models import generate_object_storage_name -from contentcuration.utils.storage_common import _get_gcs_presigned_put_url -from contentcuration.utils.storage_common import determine_content_type -from contentcuration.utils.storage_common import get_presigned_upload_url -from contentcuration.utils.storage_common import UnknownStorageBackendError - -# The modules we'll test +from contentcuration.utils.storage.common import _get_gcs_presigned_put_url +from contentcuration.utils.storage.common import determine_content_type +from contentcuration.utils.storage.common import get_presigned_upload_url +from contentcuration.utils.storage.common import UnknownStorageBackendError class MimeTypesTestCase(TestCase): diff --git a/contentcuration/contentcuration/utils/cloud_storage.py b/contentcuration/contentcuration/utils/cloud_storage.py deleted file mode 100644 index a331226905..0000000000 --- a/contentcuration/contentcuration/utils/cloud_storage.py +++ /dev/null @@ -1,39 +0,0 @@ -from automation.utils.appnexus.base import Backend -from automation.utils.appnexus.base import BackendFactory -from automation.utils.appnexus.base import BackendRequest -from automation.utils.appnexus.base import BackendResponse - - -class CloudStorageBackendRequest(BackendRequest): - pass - - -class CloudStorageRequest(CloudStorageBackendRequest): - def __init__(self) -> None: - super().__init__() - - -class CloudStorageBackendResponse(BackendResponse): - pass - - -class CloudStorageResponse(CloudStorageBackendResponse): - def __init__(self) -> None: - pass - - -class CloudStorageBackendFactory(BackendFactory): - def create_backend(self) -> Backend: - return super().create_backend() - - -class CloudStorage(Backend): - def connect(self) -> None: - return super().connect() - - def make_request(self, request) -> CloudStorageResponse: - return super().make_request(request) - - @classmethod - def _create_instance(cls) -> "CloudStorage": - return cls() diff --git a/contentcuration/contentcuration/utils/storage/__init__.py b/contentcuration/contentcuration/utils/storage/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/contentcuration/contentcuration/utils/storage/base.py b/contentcuration/contentcuration/utils/storage/base.py new file mode 100644 index 0000000000..4f3d59d635 --- /dev/null +++ b/contentcuration/contentcuration/utils/storage/base.py @@ -0,0 +1,76 @@ +from django.core.files.storage import Storage as BaseStorage + + +class Storage(BaseStorage): + def writeable(self): + """ + :rtype: bool + """ + return True + + def get_client(self): + """ + :rtype: object + """ + return None + + +class CompositeStorage(Storage): + def __init__(self): + self.backends = [] + + def _get_writeable_backend(self): + """ + :rtype: Storage + """ + for backend in self.backends: + if backend.writeable: + return backend + raise AssertionError("No writeable 
backend found") + + def _get_readable_backend(self, name): + """ + :rtype: Storage + """ + for backend in self.backends: + if backend.exists(name): + return backend + raise FileNotFoundError("{} not found".format(name)) + + def get_client(self): + return self._get_writeable_backend().get_client() + + def open(self, name, mode="rb"): + return self._get_readable_backend(name).open(name, mode) + + def save(self, name, content, max_length=None): + return self._get_writeable_backend().save(name, content, max_length=max_length) + + def delete(self, name): + self._get_writeable_backend().delete(name) + + def exists(self, name): + try: + self._get_readable_backend(name) + return True + except FileNotFoundError: + return False + + def listdir(self, path): + # This method was not implemented on GoogleCloudStorage to begin with + raise NotImplementedError("listdir is not implemented for CompositeStorage") + + def size(self, name): + return self._get_readable_backend(name).size(name) + + def url(self, name): + return self._get_readable_backend(name).url(name) + + def get_accessed_time(self, name): + return self._get_readable_backend(name).get_accessed_time(name) + + def get_created_time(self, name): + return self._get_readable_backend(name).get_created_time(name) + + def get_modified_time(self, name): + return self._get_readable_backend(name).get_modified_time(name) diff --git a/contentcuration/contentcuration/utils/storage_common.py b/contentcuration/contentcuration/utils/storage/common.py similarity index 94% rename from contentcuration/contentcuration/utils/storage_common.py rename to contentcuration/contentcuration/utils/storage/common.py index 10d79bd5c5..16cfa5ef8a 100644 --- a/contentcuration/contentcuration/utils/storage_common.py +++ b/contentcuration/contentcuration/utils/storage/common.py @@ -6,8 +6,10 @@ from django.core.files.storage import default_storage from django_s3_storage.storage import S3Storage -from .gcs_storage import CompositeGCS -from .gcs_storage import GoogleCloudStorage +from .base import CompositeStorage +from .base import Storage +from .gcs import CompositeGCS +from .gcs import GoogleCloudStorage # Do this to ensure that we infer mimetypes for files properly, specifically @@ -67,15 +69,16 @@ def get_presigned_upload_url( # both storage types are having difficulties enforcing it. 
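A minimal usage sketch (not part of the patch), assuming DEFAULT_FILE_STORAGE points at one of the CompositeStorage subclasses introduced in this series; the object path is invented for illustration. Callers keep using Django's default_storage as before: writes go to the first backend that reports itself writeable, while reads and existence checks fall back across the backend list.

    from django.core.files.base import ContentFile
    from django.core.files.storage import default_storage

    path = "storage/ab/cd/abcd1234.mp4"  # hypothetical object name

    # save() delegates to _get_writeable_backend(), i.e. the first backend
    # reporting itself writeable.
    default_storage.save(path, ContentFile(b"example bytes"))

    # exists()/open() delegate to _get_readable_backend(), which walks the
    # backend list and raises FileNotFoundError if no backend has the object.
    if default_storage.exists(path):
        with default_storage.open(path) as fobj:
            data = fobj.read()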
mimetype = determine_content_type(filepath) - if isinstance(storage, (GoogleCloudStorage, CompositeGCS)): + bucket = settings.AWS_S3_BUCKET_NAME + + if isinstance(storage, Storage): client = client or storage.get_client() - bucket = settings.AWS_S3_BUCKET_NAME + + if isinstance(storage, (GoogleCloudStorage, CompositeGCS)): upload_url = _get_gcs_presigned_put_url( client, bucket, filepath, md5sum_b64, lifetime_sec, mimetype=mimetype ) - elif isinstance(storage, S3Storage): - bucket = settings.AWS_S3_BUCKET_NAME - client = client or storage.s3_connection + elif isinstance(storage, (S3Storage, CompositeStorage)): upload_url = _get_s3_presigned_put_url( client, bucket, filepath, md5sum_b64, lifetime_sec ) diff --git a/contentcuration/contentcuration/utils/storage/dev.py b/contentcuration/contentcuration/utils/storage/dev.py new file mode 100644 index 0000000000..defb7dfaed --- /dev/null +++ b/contentcuration/contentcuration/utils/storage/dev.py @@ -0,0 +1,23 @@ +from django_s3_storage.storage import S3Storage +from google.cloud.storage import Client + +from contentcuration.utils.storage.base import CompositeStorage as BaseCompositeStorage +from contentcuration.utils.storage.base import Storage as BaseStorage +from contentcuration.utils.storage.gcs import GoogleCloudStorage + + +class Storage(S3Storage, BaseStorage): + def get_client(self): + """ + :rtype: object + """ + return self.s3_connection + + +class CompositeStorage(BaseCompositeStorage): + def __init__(self): + super(CompositeStorage, self).__init__() + self.backends.append(Storage()) + self.backends.append( + GoogleCloudStorage(Client.create_anonymous_client(), "studio-content") + ) diff --git a/contentcuration/contentcuration/utils/gcs_storage.py b/contentcuration/contentcuration/utils/storage/gcs.py similarity index 80% rename from contentcuration/contentcuration/utils/gcs_storage.py rename to contentcuration/contentcuration/utils/storage/gcs.py index 5c4a425aec..95d6b7fec5 100644 --- a/contentcuration/contentcuration/utils/gcs_storage.py +++ b/contentcuration/contentcuration/utils/storage/gcs.py @@ -6,11 +6,13 @@ import backoff from django.conf import settings from django.core.files import File -from django.core.files.storage import Storage from google.cloud.exceptions import InternalServerError from google.cloud.storage import Client from google.cloud.storage.blob import Blob +from contentcuration.utils.storage.base import CompositeStorage +from contentcuration.utils.storage.base import Storage + OLD_STUDIO_STORAGE_PREFIX = "/contentworkshop_content/" CONTENT_DATABASES_MAX_AGE = 5 # seconds @@ -122,7 +124,7 @@ def save(self, name, fobj, max_length=None, blob_object=None): # determine the current file's mimetype based on the name # import determine_content_type lazily in here, so we don't get into an infinite loop with circular dependencies - from contentcuration.utils.storage_common import determine_content_type + from contentcuration.utils.storage.common import determine_content_type content_type = determine_content_type(name) @@ -216,9 +218,9 @@ def _is_file_empty(fobj): return len(byt) == 0 -class CompositeGCS(Storage): +class CompositeGCS(CompositeStorage): def __init__(self): - self.backends = [] + super(CompositeGCS, self).__init__() self.backends.append( GoogleCloudStorage(_create_default_client(), settings.AWS_S3_BUCKET_NAME) ) @@ -227,59 +229,3 @@ def __init__(self): self.backends.append( GoogleCloudStorage(Client.create_anonymous_client(), "studio-content") ) - - def _get_writeable_backend(self): - """ - :rtype: 
GoogleCloudStorage - """ - for backend in self.backends: - if backend.writeable: - return backend - raise AssertionError("No writeable backend found") - - def _get_readable_backend(self, name): - """ - :rtype: GoogleCloudStorage - """ - for backend in self.backends: - if backend.exists(name): - return backend - raise FileNotFoundError("{} not found".format(name)) - - def get_client(self): - return self._get_writeable_backend().get_client() - - def open(self, name, mode="rb"): - return self._get_readable_backend(name).open(name, mode) - - def save(self, name, content, max_length=None): - return self._get_writeable_backend().save(name, content, max_length=max_length) - - def delete(self, name): - self._get_writeable_backend().delete(name) - - def exists(self, name): - try: - self._get_readable_backend(name) - return True - except FileNotFoundError: - return False - - def listdir(self, path): - # This method was not implemented on GoogleCloudStorage to begin with - raise NotImplementedError("listdir is not implemented for CompositeGCS") - - def size(self, name): - return self._get_readable_backend(name).size(name) - - def url(self, name): - return self._get_readable_backend(name).url(name) - - def get_accessed_time(self, name): - return self._get_readable_backend(name).get_accessed_time(name) - - def get_created_time(self, name): - return self._get_readable_backend(name).get_created_time(name) - - def get_modified_time(self, name): - return self._get_readable_backend(name).get_modified_time(name) diff --git a/contentcuration/contentcuration/viewsets/file.py b/contentcuration/contentcuration/viewsets/file.py index afadbff0cb..4c2477ad52 100644 --- a/contentcuration/contentcuration/viewsets/file.py +++ b/contentcuration/contentcuration/viewsets/file.py @@ -18,7 +18,7 @@ from contentcuration.models import generate_storage_url from contentcuration.utils.cache import ResourceSizeCache from contentcuration.utils.sentry import report_exception -from contentcuration.utils.storage_common import get_presigned_upload_url +from contentcuration.utils.storage.common import get_presigned_upload_url from contentcuration.utils.user import calculate_user_storage from contentcuration.viewsets.base import BulkDeleteMixin from contentcuration.viewsets.base import BulkListSerializer From 2450dee6d9324bba9f41b0be9c45cb1dbfc69e47 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Wed, 7 May 2025 07:37:38 -0700 Subject: [PATCH 4/7] Consolidate presigned URL handling into storage arch --- .../tests/utils/test_storage.py | 37 +++----- .../contentcuration/utils/storage/base.py | 21 +++++ .../contentcuration/utils/storage/common.py | 94 +------------------ .../contentcuration/utils/storage/dev.py | 32 ++++++- .../contentcuration/utils/storage/gcs.py | 32 +++++++ .../contentcuration/viewsets/file.py | 4 +- 6 files changed, 103 insertions(+), 117 deletions(-) diff --git a/contentcuration/contentcuration/tests/utils/test_storage.py b/contentcuration/contentcuration/tests/utils/test_storage.py index 84cb774646..b4c0e0db20 100644 --- a/contentcuration/contentcuration/tests/utils/test_storage.py +++ b/contentcuration/contentcuration/tests/utils/test_storage.py @@ -7,15 +7,15 @@ import requests from django.core.files.storage import FileSystemStorage from django.test import TestCase -from django_s3_storage.storage import S3Storage from mock import MagicMock from ..base import StudioTestCase from contentcuration.models import generate_object_storage_name -from contentcuration.utils.storage.common import _get_gcs_presigned_put_url 
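PATCH 4 consolidates presigned-PUT generation onto the storage backends, so the single entry point no longer takes content_length or an explicit client. A minimal sketch of the resulting call site, mirroring the viewsets/file.py change later in this patch; the path and checksum below are invented for illustration:

    from contentcuration.utils.storage.common import get_presigned_upload_url

    # storage defaults to django.core.files.storage.default_storage
    result = get_presigned_upload_url(
        "storage/ab/cd/abcd1234.jpg",  # destination path inside the bucket
        "aBcDeF==",                    # base64-encoded MD5 of the upload body
        600,                           # URL lifetime in seconds
    )
    upload_url = result["uploadURL"]
    mimetype = result["mimetype"]      # inferred from the file extension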
from contentcuration.utils.storage.common import determine_content_type from contentcuration.utils.storage.common import get_presigned_upload_url from contentcuration.utils.storage.common import UnknownStorageBackendError +from contentcuration.utils.storage.dev import Storage as DevStorage +from contentcuration.utils.storage.gcs import GoogleCloudStorage class MimeTypesTestCase(TestCase): @@ -79,7 +79,6 @@ def test_raises_error(self): "nice", "err", 5, - 0, storage=self.STORAGE, ) @@ -93,7 +92,9 @@ class GoogleCloudStoragePresignedURLUnitTestCase(TestCase): """ def setUp(self): + super().setUp() self.client = MagicMock() + self.storage = GoogleCloudStorage(self.client, "fake") self.generate_signed_url_method = ( self.client.get_bucket.return_value.blob.return_value.generate_signed_url ) @@ -105,19 +106,15 @@ def test_that_generate_signed_url_is_called(self): """ Check that we even call blob.generate_signed_url in the first place. """ - bucket = "fake" - _get_gcs_presigned_put_url(self.client, bucket, "/object.jpg", "aBc", 0, 0) + get_presigned_upload_url("/object.jpg", "aBc", 0, storage=self.storage) self.generate_signed_url_method.assert_called_once() def test_that_we_return_a_string(self): """ Check that _get_gcs_presigned_put_url returns a string. """ - bucket = "fake" - ret = _get_gcs_presigned_put_url( - self.client, bucket, "/object.jpg", "aBc", 0, 0 - ) - assert isinstance(ret, str) + ret = get_presigned_upload_url("/object.jpg", "aBc", 0, storage=self.storage) + assert isinstance(ret["uploadURL"], str) def test_generate_signed_url_called_with_required_arguments(self): """ @@ -135,11 +132,9 @@ def test_generate_signed_url_called_with_required_arguments(self): bucket_name = "fake" filepath = "object.jpg" lifetime = 20 # seconds - mimetype = "doesntmatter" + mimetype = "image/jpeg" - _get_gcs_presigned_put_url( - self.client, bucket_name, filepath, content_md5, lifetime, mimetype - ) + get_presigned_upload_url(filepath, content_md5, lifetime, storage=self.storage) # assert that we're creating the right object self.client.get_bucket.assert_called_once_with(bucket_name) @@ -151,8 +146,8 @@ def test_generate_signed_url_called_with_required_arguments(self): self.generate_signed_url_method.assert_called_once_with( method=method, content_md5=content_md5, - expiration=lifetime_timedelta, content_type=mimetype, + expiration=lifetime_timedelta, ) @@ -161,11 +156,9 @@ class S3StoragePresignedURLUnitTestCase(StudioTestCase): Test cases for generating presigned URLs for S3 storage, i.e. Minio. 
""" - STORAGE = S3Storage() - def setUp(self): - self.client = MagicMock() super().setUp() + self.storage = DevStorage() def test_returns_string_if_inputs_are_valid(self): """ @@ -174,9 +167,7 @@ def test_returns_string_if_inputs_are_valid(self): """ # use a real connection here as a sanity check - ret = get_presigned_upload_url( - "a/b/abc.jpg", "aBc", 10, 1, storage=self.STORAGE, client=None - ) + ret = get_presigned_upload_url("a/b/abc.jpg", "aBc", 10, storage=self.storage) url = ret["uploadURL"] assert isinstance(url, str) @@ -197,9 +188,7 @@ def test_can_upload_file_to_presigned_url(self): filename = "blahfile.jpg" filepath = generate_object_storage_name(md5_checksum, filename) - ret = get_presigned_upload_url( - filepath, md5_checksum_base64, 1000, len(file_contents) - ) + ret = get_presigned_upload_url(filepath, md5_checksum_base64, 1000) url = ret["uploadURL"] content_type = ret["mimetype"] diff --git a/contentcuration/contentcuration/utils/storage/base.py b/contentcuration/contentcuration/utils/storage/base.py index 4f3d59d635..a78e54153f 100644 --- a/contentcuration/contentcuration/utils/storage/base.py +++ b/contentcuration/contentcuration/utils/storage/base.py @@ -14,6 +14,20 @@ def get_client(self): """ return None + def get_presigned_put_url( + self, filepath, md5sum, lifetime_sec, mimetype="application/octet-stream" + ): + """ + Creates a pre-signed URL for uploading files. + + :param filepath: A string representing the destination file path inside the bucket + :param md5sum: A MD5 checksum of the file to be uploaded + :param lifetime_sec: The lifetime of the URL in seconds + :param mimetype: The content type of the file to be uploaded + :return: A pre-signed URL for uploading the file + """ + raise NotImplementedError("Subclasses must implement this method") + class CompositeStorage(Storage): def __init__(self): @@ -74,3 +88,10 @@ def get_created_time(self, name): def get_modified_time(self, name): return self._get_readable_backend(name).get_modified_time(name) + + def get_presigned_put_url( + self, filepath, md5sum, lifetime_sec, mimetype="application/octet-stream" + ): + return self._get_writeable_backend().get_presigned_put_url( + filepath, md5sum, lifetime_sec, mimetype=mimetype + ) diff --git a/contentcuration/contentcuration/utils/storage/common.py b/contentcuration/contentcuration/utils/storage/common.py index 16cfa5ef8a..6a40768720 100644 --- a/contentcuration/contentcuration/utils/storage/common.py +++ b/contentcuration/contentcuration/utils/storage/common.py @@ -1,15 +1,10 @@ import mimetypes import os -from datetime import timedelta -from django.conf import settings from django.core.files.storage import default_storage -from django_s3_storage.storage import S3Storage from .base import CompositeStorage from .base import Storage -from .gcs import CompositeGCS -from .gcs import GoogleCloudStorage # Do this to ensure that we infer mimetypes for files properly, specifically @@ -41,11 +36,10 @@ def get_presigned_upload_url( filepath, md5sum_b64, lifetime_sec, - content_length, storage=default_storage, - client=None, ): - """Return a presigned URL that can modify the given filepath through a PUT + """ + Return a presigned URL that can modify the given filepath through a PUT request. Performing a PUT request on the returned URL changes the object's contents with the contents of your PUT request. @@ -54,9 +48,6 @@ def get_presigned_upload_url( have to set a Content-MD5 HTTP header matching this md5sum once it initiates the download. 
:param: lifetime_sec: the lifetime of the generated upload url, in seconds. - :param: content_length: the size of the content, in bytes. - :param: client: the storage client that will be used to gennerate the presigned URL. - This must have an API that's similar to either the GCS client or the boto3 client. :returns: a dictionary containing 2 keys: mimetype: the mimetype that will be required to send as part of the file upload's mimetype header @@ -64,23 +55,11 @@ def get_presigned_upload_url( :raises: :class:`UnknownStorageBackendError`: If the storage backend is not S3 or GCS. """ - - # Aron: note that content_length is not used right now because - # both storage types are having difficulties enforcing it. - mimetype = determine_content_type(filepath) - bucket = settings.AWS_S3_BUCKET_NAME - if isinstance(storage, Storage): - client = client or storage.get_client() - - if isinstance(storage, (GoogleCloudStorage, CompositeGCS)): - upload_url = _get_gcs_presigned_put_url( - client, bucket, filepath, md5sum_b64, lifetime_sec, mimetype=mimetype - ) - elif isinstance(storage, (S3Storage, CompositeStorage)): - upload_url = _get_s3_presigned_put_url( - client, bucket, filepath, md5sum_b64, lifetime_sec + if isinstance(storage, (Storage, CompositeStorage)): + upload_url = storage.get_presigned_put_url( + filepath, md5sum_b64, lifetime_sec, mimetype=mimetype ) else: raise UnknownStorageBackendError( @@ -88,66 +67,3 @@ def get_presigned_upload_url( ) return {"mimetype": mimetype, "uploadURL": upload_url} - - -def _get_gcs_presigned_put_url( - gcs_client, - bucket, - filepath, - md5sum, - lifetime_sec, - mimetype="application/octet-stream", -): - bucket_obj = gcs_client.get_bucket(bucket) - blob_obj = bucket_obj.blob(filepath) - - # ensure the md5sum doesn't have any whitespace, including newlines. - # We should do the same whitespace stripping as well on any client that actually - # uses the returned presigned url. - md5sum_stripped = md5sum.strip() - - # convert the lifetime to a timedelta, so gcloud library will interpret the lifetime - # as the seconds from right now. If we use an absolute integer, it's the number of seconds - # from unix time - lifetime_timedelta = timedelta(seconds=lifetime_sec) - - url = blob_obj.generate_signed_url( - method="PUT", - content_md5=md5sum_stripped, - content_type=mimetype, - expiration=lifetime_timedelta, - ) - - return url - - -def _get_s3_presigned_put_url(s3_client, bucket, filepath, md5sum, lifetime_sec): - """ - Creates a pre-signed URL for S3-like backends, e.g. Minio. - - Note that since our production object storage backend is GCS, we do not enforce or require - any Content-MD5 value. - - :param: s3_client: an initialized S3 client. We will use this to create the presigned PUT url. - :param: bucket: the bucket where the user can PUT their object. - :param: filepath: the file path inside the bucket that the user can PUT their object. - :param: md5sum: the base64-encoded MD5sum of the object the user is planning to PUT. - This is ignored for this function and added solely to maintain API compatibility with other - private presigned URL functions. - :param: lifetime_sec: how long before the presigned URL expires, in seconds. 
- """ - # S3's PUT Object parameters: - # https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html - method = "put_object" - fields = { - "Bucket": bucket, - "Key": filepath, - } - - response = s3_client.generate_presigned_url( - ClientMethod=method, - Params=fields, - ExpiresIn=lifetime_sec, - ) - - return response diff --git a/contentcuration/contentcuration/utils/storage/dev.py b/contentcuration/contentcuration/utils/storage/dev.py index defb7dfaed..7e77a6e305 100644 --- a/contentcuration/contentcuration/utils/storage/dev.py +++ b/contentcuration/contentcuration/utils/storage/dev.py @@ -1,3 +1,4 @@ +from django.conf import settings from django_s3_storage.storage import S3Storage from google.cloud.storage import Client @@ -9,10 +10,39 @@ class Storage(S3Storage, BaseStorage): def get_client(self): """ - :rtype: object + :rtype: botocore.client.BaseClient """ return self.s3_connection + def get_presigned_put_url(self, filepath, md5sum, lifetime_sec, mimetype=None): + """ + Creates a pre-signed URL for development storage backends + + Note that since our production object storage backend is GCS, we do not enforce or require + any Content-MD5 value. + + :param: filepath: the file path inside the bucket that the user can PUT their object. + :param: md5sum: the base64-encoded MD5sum of the object the user is planning to PUT. + This is ignored for this function and added solely to maintain API compatibility with other + private presigned URL functions. + :param: lifetime_sec: how long before the presigned URL expires, in seconds. + :param: mimetype: the content type of the file to be uploaded + :return: A pre-signed URL for uploading the file + """ + # S3's PUT Object parameters: + # https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html + method = "put_object" + fields = { + "Bucket": settings.AWS_S3_BUCKET_NAME, + "Key": filepath, + } + + return self.get_client().generate_presigned_url( + ClientMethod=method, + Params=fields, + ExpiresIn=lifetime_sec, + ) + class CompositeStorage(BaseCompositeStorage): def __init__(self): diff --git a/contentcuration/contentcuration/utils/storage/gcs.py b/contentcuration/contentcuration/utils/storage/gcs.py index 95d6b7fec5..38e0347d2d 100644 --- a/contentcuration/contentcuration/utils/storage/gcs.py +++ b/contentcuration/contentcuration/utils/storage/gcs.py @@ -1,5 +1,6 @@ import logging import tempfile +from datetime import timedelta from gzip import GzipFile from io import BytesIO @@ -217,6 +218,37 @@ def _is_file_empty(fobj): fobj.seek(current_location) return len(byt) == 0 + def get_presigned_put_url( + self, filepath, md5sum, lifetime_sec, mimetype="application/octet-stream" + ): + """ + Creates a pre-signed URL for GCS. + + :param filepath: A string representing the destination file path inside the bucket + :param md5sum: A MD5 checksum of the file to be uploaded + :param lifetime_sec: The lifetime of the URL in seconds + :param mimetype: The content type of the file to be uploaded + :return: A pre-signed URL for uploading the file + """ + blob_obj = self.bucket.blob(filepath) + + # ensure the md5sum doesn't have any whitespace, including newlines. + # We should do the same whitespace stripping as well on any client that actually + # uses the returned presigned url. + md5sum_stripped = md5sum.strip() + + # convert the lifetime to a timedelta, so gcloud library will interpret the lifetime + # as the seconds from right now. 
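A brief aside on the expiration argument discussed in the comment above, assuming the behaviour that comment describes for google-cloud-storage:

    from datetime import timedelta

    # Interpreted relative to now: the signed URL expires ten minutes from signing.
    expiration_relative = timedelta(seconds=600)

    # Per the comment above, a bare int is read as seconds since the Unix epoch
    # (an absolute moment), so this value would point at 1970 and be long expired.
    expiration_absolute = 600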
If we use an absolute integer, it's the number of seconds + # from unix time + lifetime_timedelta = timedelta(seconds=lifetime_sec) + + return blob_obj.generate_signed_url( + method="PUT", + content_md5=md5sum_stripped, + content_type=mimetype, + expiration=lifetime_timedelta, + ) + class CompositeGCS(CompositeStorage): def __init__(self): diff --git a/contentcuration/contentcuration/viewsets/file.py b/contentcuration/contentcuration/viewsets/file.py index 4c2477ad52..f2e3444686 100644 --- a/contentcuration/contentcuration/viewsets/file.py +++ b/contentcuration/contentcuration/viewsets/file.py @@ -252,9 +252,7 @@ def upload_url(self, request): checksum_base64 = codecs.encode( codecs.decode(checksum, "hex"), "base64" ).decode() - retval = get_presigned_upload_url( - filepath, checksum_base64, 600, content_length=size - ) + retval = get_presigned_upload_url(filepath, checksum_base64, 600) file = File( file_size=size, From a75233bc9f503e853a750ca54ccbceb43b6bd184 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Wed, 7 May 2025 08:50:42 -0700 Subject: [PATCH 5/7] Speed up restoration by avoiding read-only GCS storage --- .../contentcuration/utils/import_tools.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/contentcuration/contentcuration/utils/import_tools.py b/contentcuration/contentcuration/utils/import_tools.py index c0d75207c5..58461ee750 100644 --- a/contentcuration/contentcuration/utils/import_tools.py +++ b/contentcuration/contentcuration/utils/import_tools.py @@ -32,6 +32,7 @@ from contentcuration.utils.files import write_base64_to_file from contentcuration.utils.garbage_collect import get_deleted_chefs_root from contentcuration.utils.publish import publish_channel +from contentcuration.utils.storage.base import CompositeStorage from contentcuration.viewsets.assessmentitem import exercise_image_filename_regex @@ -60,6 +61,11 @@ def __init__(self, base_url, api_token=None): super(ImportClient, self).__init__() self.base_url = base_url self.api_token = api_token + self.headers.update( + { + "User-Agent": f"restore_channel/kolibri-studio/dev python-requests/{requests.__version__}", + } + ) def __getattr__(self, name): if name.endswith("_with_token"): @@ -180,6 +186,11 @@ def __init__( self.download_content = download_content self.logger = logger or logging.getLogger(__name__) self.client = ImportClient(source_url, api_token=token) + self.storage = ( + default_storage._get_writeable_backend() + if isinstance(default_storage, CompositeStorage) + else default_storage + ) self.conn = None self.cursor = None self.progress = None @@ -569,8 +580,8 @@ def _download_file( file_exists = False # If the file already exists, get the size from the storage - if default_storage.exists(filepath): - file_size = file_size or default_storage.size(filepath) + if self.storage.exists(filepath): + file_size = file_size or self.storage.size(filepath) file_exists = True # if it needs downloading and if we were instructed to do so elif self.download_content or (is_thumbnail and contentnode): From 71a85bc1efad36d5a61f886114d3e0ba1a8730a5 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Wed, 7 May 2025 11:45:21 -0700 Subject: [PATCH 6/7] More defensive and thorough handling of completion and mastery criteria --- .../contentcuration/utils/import_tools.py | 101 +++++++++++------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/contentcuration/contentcuration/utils/import_tools.py b/contentcuration/contentcuration/utils/import_tools.py index 58461ee750..bc6e58baa1 
100644 --- a/contentcuration/contentcuration/utils/import_tools.py +++ b/contentcuration/contentcuration/utils/import_tools.py @@ -277,9 +277,9 @@ def run(self): unit="node", ) chunk = [] - for node in exercise_nodes.iterator(chunk_size=10): + for node in exercise_nodes.iterator(chunk_size=20): chunk.append(node) - if len(chunk) >= 10: + if len(chunk) >= 20: self._create_assessment_items(chunk) exercise_progress.update(len(chunk)) chunk = [] @@ -405,44 +405,11 @@ def _create_nodes(self, parent, progress): if coach_content: role = roles.COACH - # Determine extra_fields - extra_fields = {} - if kind == content_kinds.EXERCISE: - randomize_sql = f""" - SELECT randomize - FROM {ASSESSMENTMETADATA_TABLE} - WHERE contentnode_id = ? - """ - randomize = self.cursor.execute(randomize_sql, (id,)).fetchone() - extra_fields["options"] = json.loads(options) if options else {} - extra_fields["randomize"] = bool(randomize[0]) if randomize else False - completion_criteria_ = extra_fields["options"].get( - "completion_criteria" - ) - if ( - completion_criteria_ - and completion_criteria_.get("model") == completion_criteria.MASTERY - ): - mastery_model = completion_criteria_.get("threshold", {}).get( - "mastery_model" - ) - if mastery_model == mastery_criteria.DO_ALL: - completion_criteria_["threshold"] = { - "mastery_model": mastery_model, - } - if ( - completion_criteria_ - and "learner_managed" not in completion_criteria_ - ): - completion_criteria_["learner_managed"] = False - # Determine license license_result = self._retrieve_license(license_id) license_description = license_result[1] if license_result else "" license_result = license_result[0] if license_result else None - # TODO: Determine thumbnail encoding - # Create the new node model node = models.ContentNode.objects.create( node_id=id, @@ -458,7 +425,7 @@ def _create_nodes(self, parent, progress): license_description=license_description, language_id=lang_id, role_visibility=role, - extra_fields=extra_fields, + extra_fields=self._prepare_node_extra_fields(id, kind, options), kind_id=kind, parent=parent, original_channel_id=self.target_id, @@ -479,6 +446,8 @@ def _create_nodes(self, parent, progress): self._create_files(node) self._create_tags(node) + # assessments are handled after all nodes are created, which also ensures nodes + # are marked complete if kind != content_kinds.EXERCISE: errors = node.mark_complete() if errors: @@ -486,6 +455,64 @@ def _create_nodes(self, parent, progress): node.save() progress.update(1) + def _prepare_node_extra_fields(self, node_id, kind, options): + """ + Prepare extra fields for the node based on the kind and options. For exercises, it + retrieves the additional info from the assessment metadata. + + :param node_id: the node ID + :param kind: the content kind + :param options: the options JSON string + :return: a dictionary of extra fields + """ + extra_fields = { + "options": json.loads(options) if options else {}, + } + completion_criteria_ = extra_fields["options"].get("completion_criteria", {}) + + # don't fill anything in if there is no completion_criteria, otherwise validation will fail + if completion_criteria_ and "learner_managed" not in completion_criteria_: + completion_criteria_.update(learner_managed=False) + + if kind == content_kinds.EXERCISE: + randomize_sql = f""" + SELECT randomize, mastery_model + FROM {ASSESSMENTMETADATA_TABLE} + WHERE contentnode_id = ? 
+ """ + randomize, mastery_criteria_ = self.cursor.execute( + randomize_sql, (node_id,) + ).fetchone() + extra_fields["randomize"] = bool(randomize) if randomize else False + if mastery_criteria_: + mastery_criteria_ = json.loads(mastery_criteria_) + mastery_criteria_.update(mastery_model=mastery_criteria_.pop("type")) + completion_criteria_.update( + { + "model": completion_criteria.MASTERY, + "threshold": mastery_criteria_, + } + ) + + if completion_criteria_.get("model") == completion_criteria.MASTERY: + mastery_model = completion_criteria_.get("threshold", {}).get( + "mastery_model" + ) + if mastery_model in [ + mastery_criteria.DO_ALL, + mastery_criteria.NUM_CORRECT_IN_A_ROW_2, + mastery_criteria.NUM_CORRECT_IN_A_ROW_3, + mastery_criteria.NUM_CORRECT_IN_A_ROW_5, + mastery_criteria.NUM_CORRECT_IN_A_ROW_10, + ]: + # remove m,n values + completion_criteria_["threshold"] = { + "mastery_model": mastery_model, + } + + extra_fields["options"].update(completion_criteria=completion_criteria_) + return extra_fields + def _retrieve_license(self, license_id): """ Get license based on id from exported db @@ -544,7 +571,7 @@ def _create_files(self, contentnode): is_thumbnail=is_thumbnail, ) except IOError as e: - self.logger.warning("\b FAILED (check logs for more details)") + self.logger.warning(f"FAILED to download '{filename}': {str(e)}") if e.errno: sys.stderr.write( f"Restoration Process Error: Failed to save file object {filename}: {os.strerror(e.errno)}" From 6b0a37fcfddbfde69d96e8464b04cdc0edafa136 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Wed, 17 Dec 2025 13:19:41 -0800 Subject: [PATCH 7/7] Change python server port and add nginx to services for proxying --- .run/devserver.run.xml | 2 +- Makefile | 2 +- contentcuration/contentcuration/models.py | 43 +++-------------------- docker-compose.yml | 10 +++--- package.json | 2 +- webpack.config.js | 40 +++++++++++++-------- 6 files changed, 36 insertions(+), 63 deletions(-) diff --git a/.run/devserver.run.xml b/.run/devserver.run.xml index 1c94ee6402..55b6546404 100644 --- a/.run/devserver.run.xml +++ b/.run/devserver.run.xml @@ -13,7 +13,7 @@
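Looking back at PATCH 6's _prepare_node_extra_fields, a hedged illustration of the extra_fields shape it aims to produce for an exercise whose exported mastery model is do_all (the m/n values are stripped for the listed models); the literal strings stand in for the le_utils constants used in the patch:

    # Assumed shape only; the patch builds this with completion_criteria.MASTERY
    # and mastery_criteria.DO_ALL rather than string literals.
    extra_fields = {
        "randomize": True,
        "options": {
            "completion_criteria": {
                "model": "mastery",
                "threshold": {"mastery_model": "do_all"},
                "learner_managed": False,
            },
        },
    }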