From d9e638e0eb2070d5135ef5622e113fa4ad719783 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Tue, 6 May 2025 13:03:14 -0700 Subject: [PATCH 1/8] Refactor restore-channel utilities --- .../management/commands/restore_channel.py | 64 +- .../tests/test_restore_channel.py | 179 --- .../contentcuration/utils/files.py | 23 +- .../contentcuration/utils/import_tools.py | 1075 ++++++++++------- 4 files changed, 709 insertions(+), 632 deletions(-) delete mode 100644 contentcuration/contentcuration/tests/test_restore_channel.py diff --git a/contentcuration/contentcuration/management/commands/restore_channel.py b/contentcuration/contentcuration/management/commands/restore_channel.py index efaeb3ee7c..49b785f725 100644 --- a/contentcuration/contentcuration/management/commands/restore_channel.py +++ b/contentcuration/contentcuration/management/commands/restore_channel.py @@ -2,28 +2,66 @@ from django.core.management.base import BaseCommand -from contentcuration.utils.import_tools import import_channel +from contentcuration.utils.import_tools import ImportManager -logger = logging.getLogger('command') +logger = logging.getLogger("command") class Command(BaseCommand): + """ + This command is used to restore a channel from another Studio instance. This is for + development purposes only and should not be used in production. + """ def add_arguments(self, parser): # ID of channel to read data from - parser.add_argument('source_id', type=str) + parser.add_argument("source_id", type=str) # ID of channel to write data to (can be same as source channel) - parser.add_argument('--target', help='restore channel db to TARGET CHANNEL ID') - parser.add_argument('--download-url', help='where to download db from') - parser.add_argument('--editor', help='add user as editor to channel') + parser.add_argument( + "--target", + help="A different channel ID for which to restore the channel. 
If not provided, the source channel ID will be used.", + ) + parser.add_argument( + "--source-url", + default="http://localhost:8080", + help="Studio instance from which to download the channel DB or content files", + ) + parser.add_argument("--token", help="API token for the Studio instance") + parser.add_argument( + "--editor", + default="a@a.com", + help="Add user as editor to channel with provided email address", + ) + parser.add_argument( + "--download-content", + action="store_true", + default=False, + help="Whether to download content files", + ) + parser.add_argument( + "--public", + action="store_true", + default=False, + help="Whether to make the channel public", + ) + parser.add_argument( + "--publish", + action="store_true", + default=False, + help="Whether to publish the channel after restoration", + ) def handle(self, *args, **options): - # Set up variables for restoration process logger.info("\n\n********** STARTING CHANNEL RESTORATION **********") - source_id = options['source_id'] - target_id = options.get('target') or source_id - download_url = options.get('download_url') - editor = options.get('editor') - - import_channel(source_id, target_id, download_url, editor, logger=logger) + manager = ImportManager( + options["source_url"], + options["source_id"], + target_id=options.get("target"), + editor=options.get("editor"), + public=options.get("public"), + publish=options.get("publish"), + token=options.get("token"), + download_content=options.get("download_content"), + ) + manager.run() diff --git a/contentcuration/contentcuration/tests/test_restore_channel.py b/contentcuration/contentcuration/tests/test_restore_channel.py deleted file mode 100644 index a4d1e13a39..0000000000 --- a/contentcuration/contentcuration/tests/test_restore_channel.py +++ /dev/null @@ -1,179 +0,0 @@ -# -*- coding: utf-8 -*- -import datetime -import json -import uuid -from io import BytesIO - -from django.core.files.storage import default_storage -from django.template.loader import render_to_string -from django.utils.translation import activate -from django.utils.translation import deactivate -from le_utils.constants import exercises -from mixer.backend.django import mixer -from mock import MagicMock -from mock import patch - -from .base import StudioTestCase -from contentcuration.models import AssessmentItem -from contentcuration.models import generate_object_storage_name -from contentcuration.utils.import_tools import create_channel -from contentcuration.utils.import_tools import generate_assessment_item -from contentcuration.utils.import_tools import process_content - - -thumbnail_path = "/content/thumbnail.png" -ASSESSMENT_DATA = { - 'input-question-test': { - 'template': 'perseus/input_question.json', - 'type': exercises.INPUT_QUESTION, - 'question': "Input question", - 'question_images': [{"name": "test.jpg", "width": 12.71, "height": 12.12}], - 'hints': [{'hint': 'Hint 1'}], - 'answers': [ - {'answer': '1', 'correct': True, 'images': []}, - {'answer': '2', 'correct': True, 'images': []} - ], - 'order': 0 - }, - 'multiple-selection-test': { - 'template': 'perseus/multiple_selection.json', - 'type': exercises.MULTIPLE_SELECTION, - 'question': "Multiple selection question", - 'question_images': [], - 'hints': [], - 'answers': [ - {'answer': 'A', 'correct': True, 'images': []}, - {'answer': 'B', 'correct': True, 'images': []}, - {'answer': 'C', 'correct': False, 'images': []}, - ], - 'multiple_select': True, - 'order': 1, - 'randomize': False - }, - 'single-selection-test': { - 'template': 
'perseus/multiple_selection.json', - 'type': exercises.SINGLE_SELECTION, - 'question': "Single select question", - 'question_images': [], - 'hints': [{'hint': 'Hint test'}], - 'answers': [ - {'answer': 'Correct answer', 'correct': True, 'images': []}, - {'answer': 'Incorrect answer', 'correct': False, 'images': []}, - ], - 'multiple_select': False, - 'order': 2, - 'randomize': True - }, - 'perseus-question-test': { - 'template': 'perseus/perseus_question.json', - 'type': exercises.PERSEUS_QUESTION, - 'order': 3, - 'raw_data': '{}' - } -} - - -class ChannelRestoreUtilityFunctionTestCase(StudioTestCase): - @patch("contentcuration.utils.import_tools.write_to_thumbnail_file", return_value=thumbnail_path) - def setUp(self, thumb_mock): - self.id = uuid.uuid4().hex - self.name = "test name" - self.description = "test description" - self.thumbnail_encoding = "base64 string" - self.root_pk = uuid.uuid4() - self.version = 7 - self.last_updated = datetime.datetime.now() - self.cursor_mock = MagicMock() - self.cursor_mock.execute.return_value.fetchone.return_value = ( - self.id, - self.name, - self.description, - self.thumbnail_encoding, - self.root_pk, - self.version, - self.last_updated, - ) - self.channel, _ = create_channel(self.cursor_mock, self.id, self.admin_user) - - def test_restore_channel_id(self): - self.assertEqual(self.channel.id, self.id) - - def test_restore_channel_name(self): - self.assertEqual(self.channel.name, self.name) - - def test_restore_channel_description(self): - self.assertEqual(self.channel.description, self.description) - - def test_restore_channel_thumbnail(self): - self.assertEqual(self.channel.thumbnail, thumbnail_path) - - def test_restore_channel_thumbnail_encoding(self): - self.assertEqual(self.channel.thumbnail_encoding["base64"], self.thumbnail_encoding) - - def test_restore_channel_version(self): - self.assertEqual(self.channel.version, self.version) - - -class PerseusRestoreTestCase(StudioTestCase): - def setUp(self): - super(PerseusRestoreTestCase, self).setUp() - image_path = generate_object_storage_name('test', 'test.png') - default_storage.save(image_path, BytesIO(b'test')) - - def test_process_content(self): - tests = [ - { - "content": 'test 1', - "output": 'test 1', - 'images': {} - }, - { - "content": 'test 2 ![test](${☣ LOCALPATH}/images/test.png)', - "output": 'test 2 ![test](${☣ CONTENTSTORAGE}/test.png)', - 'images': {} - }, - { - "content": 'test 3 ![](${☣ LOCALPATH}/images/test.png)', - "output": 'test 3 ![](${☣ CONTENTSTORAGE}/test.png =50x50)', - 'images': { - '${☣ LOCALPATH}/images/test.png': { - 'width': 50, - 'height': 50 - } - } - }, - { - "content": 'test 4 ![](${☣ LOCALPATH}/images/test.png) ![](${☣ LOCALPATH}/images/test.png)', - "output": 'test 4 ![](${☣ CONTENTSTORAGE}/test.png) ![](${☣ CONTENTSTORAGE}/test.png)', - 'images': {} - }, - { - "content": 'test 5  $\\sqrt{36}+\\frac{1}{2}$ ', - "output": 'test 5 $$\\sqrt{36}+\\frac{1}{2}$$', - 'images': {} - }, - { - "content": 'test 6 $\\frac{1}{2}$ $\\frac{3}{2}$', - "output": 'test 6 $$\\frac{1}{2}$$ $$\\frac{3}{2}$$', - 'images': {} - } - ] - for test in tests: - result = process_content(test, mixer.blend(AssessmentItem)) - self.assertEqual(result, test['output']) - - def test_generate_assessment_item(self): - # Run in Spanish to ensure we are properly creating JSON with non-localized numbers - activate("es-es") - for assessment_id, data in list(ASSESSMENT_DATA.items()): - assessment_data = json.loads(render_to_string(data['template'], data).encode('utf-8', "ignore")) - assessment_item 
= generate_assessment_item(assessment_id, data['order'], data['type'], assessment_data) - self.assertEqual(assessment_item.type, data['type']) - self.assertEqual(assessment_item.question, data.get('question', '')) - self.assertEqual(assessment_item.randomize, bool(data.get('randomize'))) - self.assertEqual(assessment_item.raw_data, data.get('raw_data', '')) - for hint in json.loads(assessment_item.hints): - self.assertTrue(any(h for h in data['hints'] if h['hint'] == hint['hint'])) - for answer in json.loads(assessment_item.answers): - self.assertTrue(any(a for a in data['answers'] if a['answer'] == str(answer['answer']) and a['correct'] == answer['correct'])) - deactivate() diff --git a/contentcuration/contentcuration/utils/files.py b/contentcuration/contentcuration/utils/files.py index a5d8361e8c..74c53f8ba9 100644 --- a/contentcuration/contentcuration/utils/files.py +++ b/contentcuration/contentcuration/utils/files.py @@ -79,12 +79,13 @@ def duplicate_file(file_object, node=None, assessment_item=None, preset_id=None, return file_copy -def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH): +def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH, input_buffer=None): """ Generates a base64 encoding for a thumbnail Args: filename (str): thumbnail to generate encoding from (must be in storage already) dimension (int, optional): desired width of thumbnail. Defaults to 400. + input_buffer (BytesIO, optional): buffer to read from. Defaults to None. Returns base64 encoding of resized thumbnail """ @@ -97,17 +98,17 @@ def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH): # make sure the aspect ratio between width and height is 16:9 thumbnail_size = [dimension, round(dimension / 1.77)] try: - if not filename.startswith(settings.STATIC_ROOT): - filename = generate_object_storage_name(checksum, filename) - inbuffer = default_storage.open(filename, 'rb') - - else: - inbuffer = open(filename, 'rb') - - if not inbuffer: + if not input_buffer: + if not filename.startswith(settings.STATIC_ROOT): + filename = generate_object_storage_name(checksum, filename) + input_buffer = default_storage.open(filename, 'rb') + else: + input_buffer = open(filename, 'rb') + + if not input_buffer: raise AssertionError - with Image.open(inbuffer) as image: + with Image.open(input_buffer) as image: image_format = image.format # Note: Image.thumbnail ensures that the image will fit in the @@ -122,7 +123,7 @@ def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH): finally: # Try to close the inbuffer if it has been created try: - inbuffer.close() + input_buffer.close() except UnboundLocalError: pass outbuffer.close() diff --git a/contentcuration/contentcuration/utils/import_tools.py b/contentcuration/contentcuration/utils/import_tools.py index e662b75fc4..875291d84b 100644 --- a/contentcuration/contentcuration/utils/import_tools.py +++ b/contentcuration/contentcuration/utils/import_tools.py @@ -4,498 +4,715 @@ import logging import os import re -import shutil import sqlite3 import sys import tempfile -import zipfile +from functools import cached_property from io import BytesIO import requests -from django.conf import settings from django.core.files.storage import default_storage +from django.core.management import call_command from django.db import transaction +from kolibri_content.router import get_active_content_database +from kolibri_content.router import using_content_database +from le_utils.constants import completion_criteria from le_utils.constants import content_kinds from 
le_utils.constants import exercises from le_utils.constants import format_presets +from le_utils.constants import mastery_criteria from le_utils.constants import roles +from le_utils.constants.labels import learning_activities from contentcuration import models from contentcuration.api import write_raw_content_to_storage from contentcuration.utils.files import create_file_from_contents +from contentcuration.utils.files import get_thumbnail_encoding from contentcuration.utils.files import write_base64_to_file from contentcuration.utils.garbage_collect import get_deleted_chefs_root - - -CHANNEL_TABLE = 'content_channelmetadata' -NODE_TABLE = 'content_contentnode' -ASSESSMENTMETADATA_TABLE = 'content_assessmentmetadata' -FILE_TABLE = 'content_file' -TAG_TABLE = 'content_contenttag' -NODE_TAG_TABLE = 'content_contentnode_tags' -LICENSE_TABLE = 'content_license' +from contentcuration.utils.publish import publish_channel +from contentcuration.viewsets.assessmentitem import exercise_image_filename_regex + +CHANNEL_TABLE = "content_channelmetadata" +NODE_TABLE = "content_contentnode" +ASSESSMENTMETADATA_TABLE = "content_assessmentmetadata" +FILE_TABLE = "content_file" +TAG_TABLE = "content_contenttag" +NODE_TAG_TABLE = "content_contentnode_tags" +LICENSE_TABLE = "content_license" NODE_COUNT = 0 FILE_COUNT = 0 TAG_COUNT = 0 ANSWER_FIELD_MAP = { - exercises.SINGLE_SELECTION: 'radio 1', - exercises.MULTIPLE_SELECTION: 'radio 1', - exercises.INPUT_QUESTION: 'numeric-input 1', + exercises.SINGLE_SELECTION: "radio 1", + exercises.MULTIPLE_SELECTION: "radio 1", + exercises.INPUT_QUESTION: "numeric-input 1", } log = logging.getLogger(__name__) -def import_channel(source_id, target_id=None, download_url=None, editor=None, logger=None): - """ - Import a channel from another Studio instance. This can be used to - copy online Studio channels into local machines for development, - testing, faster editing, or other purposes. - - :param source_id: The UUID of the channel to import from the source Studio instance. - :param target_id: The UUID of the channel on the local instance. Defaults to source_id. - :param download_url: The URL of the Studio instance to import from. - :param editor: The email address of the user you wish to add as an editor, if any. 
- - """ - - global log - if logger: - log = logger - else: - log = logging.getLogger(__name__) - - # Set up variables for the import process - log.info("\n\n********** STARTING CHANNEL IMPORT **********") - start = datetime.datetime.now() - target_id = target_id or source_id - - # Test connection to database - log.info("Connecting to database for channel {}...".format(source_id)) - - tempf = tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False) - conn = None - try: - if download_url: - response = requests.get('{}/content/databases/{}.sqlite3'.format(download_url, source_id)) - for chunk in response: - tempf.write(chunk) - else: - filepath = "/".join([settings.DB_ROOT, "{}.sqlite3".format(source_id)]) - # Check if database exists - if not default_storage.exists(filepath): - raise IOError("The object requested does not exist.") - with default_storage.open(filepath) as fobj: - shutil.copyfileobj(fobj, tempf) - - tempf.close() - conn = sqlite3.connect(tempf.name) - cursor = conn.cursor() - - # Start by creating channel - log.info("Creating channel...") - editor = models.User.objects.get(email=editor) - channel, root_pk = create_channel(conn, target_id, editor) - channel.editors.add(editor) - channel.save() - - # Create root node - root = models.ContentNode.objects.create( - node_id=root_pk, - title=channel.name, - kind_id=content_kinds.TOPIC, - original_channel_id=target_id, - source_channel_id=target_id, +class ImportClient(requests.Session): + def __init__(self, base_url, api_token=None): + super(ImportClient, self).__init__() + self.base_url = base_url + self.api_token = api_token + + def __getattr__(self, name): + if name.endswith("_with_token"): + if not self.api_token: + raise ValueError("API token is required for this method.") + + target_method = getattr( + super(ImportClient, self), name.replace("_with_token", "") + ) + token_headers = { + "Authorization": f"Token {self.api_token}", + } + return lambda url, *args, **kwargs: target_method( + url, *args, headers=token_headers, **kwargs + ) + raise AttributeError( + f"'{self.__class__.__name__}' object has no attribute '{name}'" ) - # Create nodes mapping to channel - log.info(" Creating nodes...") - with transaction.atomic(): - create_nodes(cursor, target_id, root, download_url=download_url) - # TODO: Handle prerequisites - - # Delete the previous tree if it exists - old_previous = channel.previous_tree - if old_previous: - old_previous.parent = get_deleted_chefs_root() - old_previous.title = "Old previous tree for channel {}".format(channel.pk) - old_previous.save() - - # Save tree to target tree - channel.previous_tree = channel.main_tree - channel.main_tree = root - channel.save() - finally: - conn and conn.close() - tempf.close() - os.unlink(tempf.name) - - # Print stats - log.info("\n\nChannel has been imported (time: {ms})\n".format(ms=datetime.datetime.now() - start)) - log.info("\n\n********** IMPORT COMPLETE **********\n\n") - - -def create_channel(cursor, target_id, editor): - """ create_channel: Create channel at target id - Args: - cursor (sqlite3.Connection): connection to export database - target_id (str): channel_id to write to - Returns: channel model created and id of root node - """ - id, name, description, thumbnail, root_pk, version, last_updated = cursor.execute( - 'SELECT id, name, description, thumbnail, root_pk, version, last_updated FROM {table}' - .format(table=CHANNEL_TABLE)).fetchone() - channel, is_new = models.Channel.objects.get_or_create(pk=target_id, actor_id=editor.id) - channel.name = name - 
channel.description = description - channel.thumbnail = write_to_thumbnail_file(thumbnail) - channel.thumbnail_encoding = {'base64': thumbnail, 'points': [], 'zoom': 0} - channel.version = version - channel.save() - log.info("\tCreated channel {} with name {}".format(target_id, name)) - return channel, root_pk + def request(self, method, url, *args, **kwargs): + url = f"{self.base_url}{url}" + return super(ImportClient, self).request(method, url, *args, **kwargs) def write_to_thumbnail_file(raw_thumbnail): - """ write_to_thumbnail_file: Convert base64 thumbnail to file - Args: - raw_thumbnail (str): base64 encoded thumbnail - Returns: thumbnail filename + """write_to_thumbnail_file: Convert base64 thumbnail to file + Args: + raw_thumbnail (str): base64 encoded thumbnail + Returns: thumbnail filename """ - if raw_thumbnail and isinstance(raw_thumbnail, str) and raw_thumbnail != "" and 'static' not in raw_thumbnail: + if ( + raw_thumbnail + and isinstance(raw_thumbnail, str) + and raw_thumbnail != "" + and "static" not in raw_thumbnail + ): with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tempf: try: tempf.close() write_base64_to_file(raw_thumbnail, tempf.name) - with open(tempf.name, 'rb') as tf: - fobj = create_file_from_contents(tf.read(), ext="png", preset_id=format_presets.CHANNEL_THUMBNAIL) + with open(tempf.name, "rb") as tf: + fobj = create_file_from_contents( + tf.read(), ext="png", preset_id=format_presets.CHANNEL_THUMBNAIL + ) return str(fobj) finally: tempf.close() os.unlink(tempf.name) -def create_nodes(cursor, target_id, parent, indent=1, download_url=None): - """ create_channel: Create channel at target id - Args: - cursor (sqlite3.Connection): connection to export database - target_id (str): channel_id to write to - parent (models.ContentNode): node's parent - indent (int): How far to indent print statements - Returns: newly created node +def convert_metadata_to_dict(metadata): """ - # Read database rows that match parent - parent_query = "parent_id=\'{}\'".format(parent.node_id) - - sql_command = 'SELECT id, title, content_id, description, sort_order, '\ - 'license_owner, author, license_id, kind, coach_content, lang_id FROM {table} WHERE {query} ORDER BY sort_order;'\ - .format(table=NODE_TABLE, query=parent_query) - query = cursor.execute(sql_command).fetchall() - - # Parse through rows and create models - for id, title, content_id, description, sort_order, license_owner, author, license_id, kind, coach_content, lang_id in query: - log.info("{indent} {id} ({title} - {kind})...".format(indent=" |" * indent, id=id, title=title, kind=kind)) - - # Determine role - role = roles.LEARNER - if coach_content: - role = roles.COACH - - # Determine extra_fields - assessment_query = "SELECT mastery_model, randomize FROM {table} WHERE contentnode_id='{node}'".format(table=ASSESSMENTMETADATA_TABLE, node=id) - result = cursor.execute(assessment_query).fetchone() - extra_fields = result[0] if result else {} - if isinstance(extra_fields, str): - extra_fields = json.loads(extra_fields) - if result: - extra_fields.update({"randomize": result[1]}) - - # Determine license - license = retrieve_license(cursor, license_id) - license_description = license[1] if license else "" - license = license[0] if license else None - - # TODO: Determine thumbnail encoding - - # Create new node model - node = models.ContentNode.objects.create( - node_id=id, - original_source_node_id=id, - source_node_id=id, - title=title, - content_id=content_id, - description=description, - 
sort_order=sort_order, - copyright_holder=license_owner, - author=author, - license=license, - license_description=license_description, - language_id=lang_id, - role_visibility=role, - extra_fields=extra_fields, - kind_id=kind, - parent=parent, - original_channel_id=target_id, - source_channel_id=target_id, - ) - - # Handle foreign key references (children, files, tags) - if kind == content_kinds.TOPIC: - create_nodes(cursor, target_id, node, indent=indent + 1, download_url=download_url) - elif kind == content_kinds.EXERCISE: - create_assessment_items(cursor, node, indent=indent + 1, download_url=download_url) - create_files(cursor, node, indent=indent + 1, download_url=download_url) - create_tags(cursor, node, target_id, indent=indent + 1) + Convert metadata from a string to a dictionary. - return node + :param metadata: The metadata string to convert. + :return: A dictionary representation of the metadata. + """ + if isinstance(metadata, str): + metadata_split = metadata.split(",") + return {metadata_key: True for metadata_key in metadata_split} + return metadata -def retrieve_license(cursor, license_id): - """ retrieve_license_name: Get license based on id from exported db - Args: - cursor (sqlite3.Connection): connection to export database - license_id (str): id of license on exported db - Returns: license model matching the name and the associated license description +def convert_learning_activities_to_dict(content_kind, metadata): """ - # Handle no license being assigned - if license_id is None or license_id == "": - return None + Convert learning activities from a string to a dictionary. - # Return license that matches name - name, description = cursor.execute( - 'SELECT license_name, license_description FROM {table} WHERE id={id}'.format(table=LICENSE_TABLE, id=license_id) - ).fetchone() - return models.License.objects.get(license_name=name), description - - -def download_file(filename, download_url=None, contentnode=None, assessment_item=None, preset=None, file_size=None, lang_id=None): - checksum, extension = os.path.splitext(filename) - extension = extension.lstrip('.') - filepath = models.generate_object_storage_name(checksum, filename) - - # Download file if it hasn't already been downloaded - if download_url and not default_storage.exists(filepath): - buffer = BytesIO() - response = requests.get('{}/content/storage/{}/{}/{}'.format(download_url, filename[0], filename[1], filename)) - for chunk in response: - buffer.write(chunk) - - checksum, _, filepath = write_raw_content_to_storage(buffer.getvalue(), ext=extension) - buffer.close() - - # Save values to new file object - file_obj = models.File( - file_format_id=extension, - file_size=file_size or default_storage.size(filepath), - contentnode=contentnode, - assessment_item=assessment_item, - language_id=lang_id, - preset_id=preset or "", - ) - file_obj.file_on_disk.name = filepath - file_obj.save() - - -def create_files(cursor, contentnode, indent=0, download_url=None): - """ create_files: Get license - Args: - cursor (sqlite3.Connection): connection to export database - contentnode (models.ContentNode): node file references - indent (int): How far to indent print statements - Returns: None + :param content_kind: The content kind of the learning activities. + :param metadata: The learning activities string to convert. + :return: A dictionary representation of the learning activities. 
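+
+    Illustrative example (a sketch using the le_utils constants referenced below,
+    not an exhaustive mapping)::
+
+        convert_learning_activities_to_dict(content_kinds.AUDIO, None)
+        # -> {learning_activities.LISTEN: True}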
""" - # Parse database for files referencing content node and make file models - sql_command = 'SELECT checksum, extension, file_size, contentnode_id, '\ - 'lang_id, preset FROM {table} WHERE contentnode_id=\'{id}\';'\ - .format(table=FILE_TABLE, id=contentnode.node_id) + metadata = convert_metadata_to_dict(metadata) + if isinstance(metadata, dict): + return metadata + + if content_kind == content_kinds.EXERCISE: + return {learning_activities.PRACTICE: True} + elif content_kind in [content_kinds.HTML5, content_kinds.H5P]: + return {learning_activities.EXPLORE: True} + elif content_kind == content_kinds.AUDIO: + return {learning_activities.LISTEN: True} + elif content_kind == content_kinds.VIDEO: + return {learning_activities.WATCH: True} + elif content_kind == content_kinds.DOCUMENT: + return {learning_activities.READ: True} + elif content_kind == content_kinds.SLIDESHOW: + return {learning_activities.READ: True} + elif content_kind == content_kinds.TOPIC: + return None + return {learning_activities.EXPLORE: True} - query = cursor.execute(sql_command).fetchall() - for checksum, extension, file_size, contentnode_id, lang_id, preset in query: - filename = "{}.{}".format(checksum, extension) - log.info("{indent} * FILE {filename}...".format(indent=" |" * indent, filename=filename)) - try: - download_file(filename, download_url=download_url, contentnode=contentnode, preset=preset, file_size=file_size, lang_id=lang_id) - - except IOError as e: - log.warning("\b FAILED (check logs for more details)") - sys.stderr.write("Restoration Process Error: Failed to save file object {}: {}".format(filename, os.strerror(e.errno))) - continue - - -def create_tags(cursor, contentnode, target_id, indent=0): - """ create_tags: Create tags associated with node - Args: - cursor (sqlite3.Connection): connection to export database - contentnode (models.ContentNode): node file references - target_id (str): channel_id to write to - indent (int): How far to indent print statements - Returns: None +class ImportManager(object): """ - # Parse database for files referencing content node and make file models - sql_command = 'SELECT ct.id, ct.tag_name FROM {cnttable} cnt '\ - 'JOIN {cttable} ct ON cnt.contenttag_id = ct.id ' \ - 'WHERE cnt.contentnode_id=\'{id}\';'\ - .format( - cnttable=NODE_TAG_TABLE, - cttable=TAG_TABLE, - id=contentnode.node_id, - ) - query = cursor.execute(sql_command).fetchall() - - # Build up list of tags - tag_list = [] - for id, tag_name in query: - log.info("{indent} ** TAG {tag}...".format(indent=" |" * indent, tag=tag_name)) - # Save values to new or existing tag object - tag_obj, is_new = models.ContentTag.objects.get_or_create( - pk=id, - tag_name=tag_name, - channel_id=target_id, - ) - tag_list.append(tag_obj) - - # Save tags to node - contentnode.tags.set(tag_list) - contentnode.save() - - -def create_assessment_items(cursor, contentnode, indent=0, download_url=None): - """ create_assessment_items: Generate assessment items based on perseus zip - Args: - cursor (sqlite3.Connection): connection to export database - contentnode (models.ContentNode): node assessment items reference - indent (int): How far to indent print statements - download_url (str): Domain to download files from - Returns: None + Import a channel from another Studio instance. This can be used to copy online Studio channels + into local machines for development, testing, faster editing, or other purposes. 
""" - # Parse database for files referencing content node and make file models - sql_command = 'SELECT checksum, extension '\ - 'preset FROM {table} WHERE contentnode_id=\'{id}\' AND preset=\'exercise\';'\ - .format(table=FILE_TABLE, id=contentnode.node_id) - - query = cursor.execute(sql_command).fetchall() - for checksum, extension in query: - filename = "{}.{}".format(checksum, extension) - log.info("{indent} * EXERCISE {filename}...".format(indent=" |" * indent, filename=filename)) - + def __init__( + self, + source_url, + source_id, + target_id=None, + editor=None, + public=False, + publish=False, + token=None, + download_content=True, + logger=None, + ): + self.source_id = source_id + self.target_id = target_id or source_id + self.source_url = source_url + self.editor = editor + self.public = public + self.publish = publish + self.token = token + self.download_content = download_content + self.logger = logger or logging.getLogger(__name__) + self.client = ImportClient(source_url, api_token=token) + self.conn = None + self.cursor = None + self.schema_version = None + + @cached_property + def editor_user(self): + """ + Get the User object for the editor email address. + + :return: The User object for the editor. + """ + return models.User.objects.get(email=self.editor) if self.editor else None + + def run(self): + """ + Run the import process. + """ + # Set up variables for the import process + self.logger.info("\n\n********** STARTING CHANNEL IMPORT **********") + start = datetime.datetime.now() + + if not self.token: + self.logger.warning( + "No API token provided. This may result in limited functionality." + ) + + # Test connection to the database + self.logger.info(f"Connecting to database for channel {self.source_id}...") + + tempf = tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False) try: - # Store the downloaded zip into temporary storage - tempf = tempfile.NamedTemporaryFile(suffix='.{}'.format(extension), delete=False) - response = requests.get('{}/content/storage/{}/{}/{}'.format(download_url, filename[0], filename[1], filename)) + response = self.client.get(f"/content/databases/{self.source_id}.sqlite3") for chunk in response: tempf.write(chunk) - tempf.close() - extract_assessment_items(tempf.name, contentnode, download_url=download_url) - except IOError as e: - log.warning("\b FAILED (check logs for more details)") - sys.stderr.write("Restoration Process Error: Failed to save file object {}: {}".format(filename, os.strerror(e.errno))) - continue - finally: - os.unlink(tempf.name) + tempf.close() -def extract_assessment_items(filepath, contentnode, download_url=None): - """ extract_assessment_items: Create and save assessment items to content node - Args: - filepath (str): Where perseus zip is stored - contentnode (models.ContentNode): node assessment items reference - download_url (str): Domain to download files from - Returns: None - """ - - try: - tempdir = tempfile.mkdtemp() - with zipfile.ZipFile(filepath, 'r') as zipf: - zipf.extractall(tempdir) - os.chdir(tempdir) - - with open('exercise.json', 'rb') as fobj: - data = json.load(fobj) - - for index, assessment_id in enumerate(data['all_assessment_items']): - with open('{}.json'.format(assessment_id), 'rb') as fobj: - assessment_item = generate_assessment_item( - assessment_id, - index, - data['assessment_mapping'][assessment_id], - json.load(fobj), - download_url=download_url + with using_content_database(tempf.name): + call_command( + "migrate", + "content", + database=get_active_content_database(), + 
interactive=False,  # migrate's --no-input flag maps to call_command's "interactive" option
+                )

-                contentnode.assessment_items.add(assessment_item)
-    finally:
-        shutil.rmtree(tempdir)
-
-
-def generate_assessment_item(assessment_id, order, assessment_type, assessment_data, download_url=None):
-    """ generate_assessment_item: Generates a new assessment item
-    Args:
-        assessment_id (str): AssessmentItem.assessment_id value
-        order (Number): AssessmentItem.order value
-        assessment_type (str): AssessmentItem.type value
-        assessment_data (dict): Extracted data from perseus file
-        download_url (str): Domain to download files from
-    Returns: models.AssessmentItem
-    """
-    assessment_item = models.AssessmentItem.objects.create(
-        assessment_id=assessment_id,
-        type=assessment_type,
-        order=order
-    )
-    if assessment_type == exercises.PERSEUS_QUESTION:
-        assessment_item.raw_data = json.dumps(assessment_data)
-    else:
-        # Parse questions
-        assessment_data['question']['content'] = '\n\n'.join(assessment_data['question']['content'].split('\n\n')[:-1])
-        assessment_item.question = process_content(assessment_data['question'], assessment_item, download_url=download_url)
-
-        # Parse answers
-        answer_data = assessment_data['question']['widgets'][ANSWER_FIELD_MAP[assessment_type]]['options']
-        if assessment_type == exercises.INPUT_QUESTION:
-            assessment_item.answers = json.dumps([
-                {'answer': answer['value'], 'correct': True} for answer in answer_data['answers']
-            ])
-        else:
-            assessment_item.answers = json.dumps([
-                {'answer': process_content(answer, assessment_item, download_url=download_url), 'correct': answer['correct']}
-                for answer in answer_data['choices']
-            ])
-        assessment_item.randomize = answer_data['randomize']
-
-        # Parse hints
-        assessment_item.hints = json.dumps([
-            {'hint': process_content(hint, assessment_item, download_url=download_url)}
-            for hint in assessment_data['hints']
-        ])
-
-    assessment_item.save()
-    return assessment_item
-
-
-def process_content(data, assessment_item, download_url=None):
-    """ process_content: Parses perseus text for special formatting (e.g. formulas, images)
-    Args:
-        data (dict): Perseus data to parse (e.g. 
parsing 'question' field)
-        download_url (str): Domain to download files from
-        assessment_item (models.AssessmentItem): assessment item to save images to
-    Returns: models.AssessmentItem
-    """
-    data['content'] = data['content'].replace(' ', '')  # Remove unrecognized non unicode characters
-    # Process formulas
-    for match in re.finditer(r'(\$[^\$☣]+\$)', data['content']):
-        data['content'] = data['content'].replace(match.group(0), '${}$'.format(match.group(0)))
-    # Process images
-
-    for match in re.finditer(r'!\[[^\]]*\]\((\$(\{☣ LOCALPATH\}\/images)\/([^\.]+\.[^\)]+))\)', data['content']):
-        data['content'] = data['content'].replace(match.group(2), exercises.CONTENT_STORAGE_PLACEHOLDER)
-        image_data = data['images'].get(match.group(1))
-        if image_data and image_data.get('width'):
-            data['content'] = data['content'].replace(match.group(3), '{} ={}x{}'.format(match.group(3), image_data['width'], image_data['height']))
-
-        # Save files to db
-        download_file(match.group(3), assessment_item=assessment_item, preset=format_presets.EXERCISE, download_url=download_url)
-
-    return data['content']

+            self.conn = sqlite3.connect(tempf.name)
+            self.cursor = self.conn.cursor()
+
+            # Start by creating the channel
+            self.logger.info("Creating channel...")
+            channel, root_pk = self._create_channel()
+            channel.editors.add(self.editor_user)
+            channel.save()
+
+            # Create the root node
+            root = models.ContentNode.objects.create(
+                node_id=root_pk,
+                title=channel.name,
+                kind_id=content_kinds.TOPIC,
+                original_channel_id=self.target_id,
+                source_channel_id=self.target_id,
+                complete=True,
+            )
+
+            # Create nodes mapping to channel
+            self.logger.info("   Creating nodes...")
+            with transaction.atomic():
+                self._create_nodes(root)
+                # TODO: Handle prerequisites
+
+            # Delete the previous tree if it exists
+            old_previous = channel.previous_tree
+            if old_previous:
+                old_previous.parent = get_deleted_chefs_root()
+                old_previous.title = f"Old previous tree for channel {channel.pk}"
+                old_previous.save()
+
+            # Save the new tree to the target tree, and preserve the old one
+            channel.previous_tree = channel.main_tree
+            channel.main_tree = root
+            channel.save()
+        finally:
+            self.conn and self.conn.close()
+            tempf.close()
+            os.unlink(tempf.name)
+
+        # Publish the channel if requested
+        if self.publish:
+            self.logger.info("Publishing channel...")
+            publish_channel(self.editor_user.id, channel.id)
+
+        # Print stats
+        self.logger.info(
+            f"\n\nChannel has been imported (time: {datetime.datetime.now() - start})\n"
+        )
+        self.logger.info("\n\n********** IMPORT COMPLETE **********\n\n")
+
+    def _create_channel(self):
+        """
+        Create the channel at the target ID
+        """
+        (
+            id,
+            name,
+            description,
+            thumbnail,
+            root_pk,
+            version,
+            last_updated,
+            schema_version,
+        ) = self.cursor.execute(
+            f"""
+            SELECT
+                id, name, description, thumbnail, root_pk, version, last_updated,
+                min_schema_version
+            FROM {CHANNEL_TABLE}
+            """
+        ).fetchone()
+        # Use the most common node language as the channel language
+        lang_id, _ = self.cursor.execute(
+            f"""
+            SELECT lang_id, COUNT(id) AS node_by_lang_count
+            FROM {NODE_TABLE}
+            GROUP BY lang_id
+            ORDER BY node_by_lang_count DESC
+            """
+        ).fetchone()
+        channel, is_new = models.Channel.objects.get_or_create(
+            pk=self.target_id, actor_id=self.editor_user.id
+        )
+        channel.name = name
+        channel.description = description
+        channel.language_id = lang_id
+        channel.thumbnail = write_to_thumbnail_file(thumbnail)
+        channel.thumbnail_encoding = {"base64": thumbnail, "points": [], "zoom": 0}
+        channel.version = version
+        channel.public = self.public
+        channel.save()
+        self.logger.info(f"\tCreated channel {self.target_id} 
with name {name}") + return channel, root_pk + + def _create_nodes(self, parent, indent=1): + """ + Create node(s) for a channel with target id + + :param parent: node's parent + :param indent: How far to indent print statements + """ + sql_command = f""" + SELECT + id, title, content_id, description, sort_order, license_owner, author, license_id, + kind, coach_content, lang_id, grade_levels, resource_types, learning_activities, + accessibility_labels, categories, learner_needs, duration, options + FROM {NODE_TABLE} + WHERE parent_id = ? + ORDER BY sort_order; + """ + query = self.cursor.execute( + sql_command, (getattr(parent, "node_id", parent),) + ).fetchall() + + # Parse through rows and create models + for ( + id, + title, + content_id, + description, + sort_order, + license_owner, + author, + license_id, + kind, + coach_content, + lang_id, + grade_levels, + resource_types, + learning_activities_, + accessibility_labels, + categories, + learner_needs, + duration, + options, + ) in query: + self.logger.info( + "{indent} {id} ({title} - {kind})...".format( + indent=" |" * indent, id=id, title=title, kind=kind + ) + ) + + # Determine role + role = roles.LEARNER + if coach_content: + role = roles.COACH + + # Determine extra_fields + extra_fields = {} + if kind == content_kinds.EXERCISE: + randomize_sql = f""" + SELECT randomize + FROM {ASSESSMENTMETADATA_TABLE} + WHERE contentnode_id = ? + """ + randomize = self.cursor.execute(randomize_sql, (id,)).fetchone() + extra_fields["options"] = json.loads(options) if options else {} + extra_fields["randomize"] = bool(randomize[0]) if randomize else False + completion_criteria_ = extra_fields["options"].get( + "completion_criteria" + ) + if ( + completion_criteria_ + and completion_criteria_.get("model") == completion_criteria.MASTERY + ): + mastery_model = completion_criteria_.get("threshold", {}).get( + "mastery_model" + ) + if mastery_model == mastery_criteria.DO_ALL: + completion_criteria_["threshold"] = { + "mastery_model": mastery_model, + } + if ( + completion_criteria_ + and "learner_managed" not in completion_criteria_ + ): + completion_criteria_["learner_managed"] = False + + # Determine license + license_result = self._retrieve_license(license_id) + license_description = license_result[1] if license_result else "" + license_result = license_result[0] if license_result else None + + # TODO: Determine thumbnail encoding + + # Create the new node model + node = models.ContentNode.objects.create( + node_id=id, + original_source_node_id=id, + source_node_id=id, + title=title, + content_id=content_id, + description=description, + sort_order=sort_order, + copyright_holder=license_owner, + author=author, + license=license_result, + license_description=license_description, + language_id=lang_id, + role_visibility=role, + extra_fields=extra_fields, + kind_id=kind, + parent=parent, + original_channel_id=self.target_id, + source_channel_id=self.target_id, + grade_levels=convert_metadata_to_dict(grade_levels), + resource_types=convert_metadata_to_dict(resource_types), + learning_activities=convert_learning_activities_to_dict( + kind, learning_activities_ + ), + accessibility_labels=convert_metadata_to_dict(accessibility_labels), + categories=convert_metadata_to_dict(categories), + learner_needs=convert_metadata_to_dict(learner_needs), + ) + + # Handle foreign key references (children, files, tags) + if kind == content_kinds.TOPIC: + self._create_nodes(node, indent=indent + 1) + elif kind == content_kinds.EXERCISE: + 
self._create_assessment_items(node, indent=indent + 1) + self._create_files(node, indent=indent + 1) + self._create_tags(node, indent=indent + 1) + + errors = node.mark_complete() + if errors: + self.logger.warning(f"Node {node.node_id} has errors: {errors}") + node.save() + + def _retrieve_license(self, license_id): + """ + Get license based on id from exported db + + :param license_id: id of license on exported db + :return: license model matching the id and the associated license description + :rtype: tuple + """ + # Handle no license being assigned + if license_id is None or license_id == "": + return None + + # Return license that matches name + name, description = self.cursor.execute( + f""" + SELECT license_name, license_description + FROM {LICENSE_TABLE} + WHERE id = ? + """, + (license_id,), + ).fetchone() + return models.License.objects.get(license_name=name), description + + def _create_files(self, contentnode, indent=0): + """ + Create and possibly download node files + + :param contentnode: node file references + :param indent: How far to indent print statements + """ + # Parse database for files referencing content node and make file models + sql_command = f""" + SELECT checksum, extension, file_size, contentnode_id, lang_id, preset, thumbnail + FROM {FILE_TABLE} + WHERE contentnode_id = ?; + """ + query = self.cursor.execute(sql_command, (contentnode.node_id,)).fetchall() + + for ( + checksum, + extension, + file_size, + contentnode_id, + lang_id, + preset, + is_thumbnail, + ) in query: + filename = "{}.{}".format(checksum, extension) + self.logger.info( + "{indent} * FILE {filename}...".format( + indent=" |" * indent, filename=filename + ) + ) - return data['content'] + try: + self._download_file( + filename, + contentnode=contentnode, + preset=preset, + file_size=file_size, + lang_id=lang_id, + is_thumbnail=is_thumbnail, + ) + except IOError as e: + self.logger.warning("\b FAILED (check logs for more details)") + if e.errno: + sys.stderr.write( + f"Restoration Process Error: Failed to save file object {filename}: {os.strerror(e.errno)}" + ) + continue + + def _download_file( + self, + filename, + contentnode=None, + assessment_item=None, + preset=None, + file_size=None, + lang_id=None, + is_thumbnail=False, + ): + """ + Create and possibly download a file from source instance and save to local storage + + :param filename: the name of the file to download + :param contentnode: the associated content node + :param assessment_item: the associated assessment item + :param preset: the format preset for the file + :param file_size: the known size of the file + :param lang_id: the language ID of the file + :param is_thumbnail: whether the file is a thumbnail + """ + checksum, extension = os.path.splitext(filename) + extension = extension.lstrip(".") + filepath = models.generate_object_storage_name(checksum, filename) + + file_url = f"/content/storage/{filename[0]}/{filename[1]}/{filename}" + file_exists = False + + # If the file already exists, get the size from the storage + if default_storage.exists(filepath): + file_size = file_size or default_storage.size(filepath) + file_exists = True + # if it needs downloading and if we were instructed to do so + elif self.download_content or (is_thumbnail and contentnode): + buffer = BytesIO() + response = self.client.get(file_url) + for chunk in response: + buffer.write(chunk) + + if is_thumbnail and contentnode: + # If the file is a thumbnail, save it to the content node + contentnode.thumbnail_encoding = json.dumps( + { + "base64": 
get_thumbnail_encoding(filename, input_buffer=buffer), + "points": [], + "zoom": 0, + } + ) + else: + checksum, _, filepath = write_raw_content_to_storage( + buffer.getvalue(), ext=extension + ) + buffer.close() + file_exists = True + # otherwise, if file size is not known, get it from the response headers + elif not file_size: + response = self.client.head(file_url) + file_size = int(response.headers.get("Content-Length", 0)) + + # Save values to a new file object + file_obj = models.File( + file_format_id=extension, + file_size=file_size, + contentnode=contentnode, + assessment_item=assessment_item, + language_id=lang_id, + preset_id=preset or "", + checksum=checksum, + ) + file_obj.file_on_disk.name = filepath + # set_by_file_on_disk: skip unless the file has been downloaded + file_obj.save(set_by_file_on_disk=file_exists) + + def _create_tags(self, contentnode, indent=0): + """ + Create tags associated with node + + :param contentnode: node tags reference + :param indent: How far to indent print statements + """ + # Parse database for files referencing content node and make file models + sql_command = f""" + SELECT ct.id, ct.tag_name + FROM {NODE_TAG_TABLE} cnt + JOIN {TAG_TABLE} ct ON cnt.contenttag_id = ct.id + WHERE cnt.contentnode_id = ?; + """ + query = self.cursor.execute(sql_command, (contentnode.node_id,)).fetchall() + + # Build up list of tags + tag_list = [] + for id, tag_name in query: + self.logger.info( + "{indent} ** TAG {tag}...".format(indent=" |" * indent, tag=tag_name) + ) + # Save values to new or existing tag object + tag_obj, is_new = models.ContentTag.objects.get_or_create( + pk=id, + tag_name=tag_name, + channel_id=self.target_id, + ) + tag_list.append(tag_obj) + + # Save tags to node + contentnode.tags.set(tag_list) + contentnode.save() + + def _create_assessment_items(self, contentnode, indent=0): + """ + Generate assessment items based on perseus zip + + :param contentnode: node assessment items reference + :param indent: How far to indent print statements + """ + if not self.token: + self.logger.warning( + f"Skipping assessment items for node {contentnode.node_id}" + ) + return + + # first obtain the content node's Studio ID with the node ID + node_response = self.client.get_with_token( + f"/api/contentnode?_node_id_channel_id___in={contentnode.node_id},{self.source_id}" + ) + if node_response.status_code != 200: + self.logger.warning( + f"Failed to obtain assessment items for node {contentnode.node_id}" + ) + return + + node_data = node_response.json() + contentnode_id = node_data[0]["id"] if node_data else None + if not contentnode_id: + self.logger.warning(f"No content node found for node {contentnode.node_id}") + return + + # Get the content node's assessment items + assessment_response = self.client.get_with_token( + f"/api/assessmentitem?contentnode__in={contentnode_id}" + ) + if assessment_response.status_code != 200: + self.logger.warning( + f"Failed to obtain assessment items for node {contentnode.node_id}" + ) + return + + assessment_items = assessment_response.json() + if not assessment_items: + self.logger.warning( + f"No assessment items found for node {contentnode.node_id}" + ) + return + + # Create the assessment items + for item in assessment_items: + self.logger.info( + "{indent} ** ASSESSMENT ITEM {assessment_id}...".format( + indent=" |" * indent, assessment_id=item["assessment_id"] + ) + ) + assessment_item = models.AssessmentItem.objects.create( + assessment_id=item["assessment_id"], + type=item["type"], + order=item["order"], + 
question=item["question"], + answers=item["answers"], + hints=item["hints"], + randomize=item.get("randomize", False), + ) + contentnode.assessment_items.add(assessment_item) + contentnode.save() + + def _process_assessment_images(self, assessment_item): + """ + Process images in assessment items and save them to the database. + + :param assessment_item: The assessment item to process. + """ + if not self.download_content: + # Skip if not downloading content + return + + for content in [ + assessment_item.question, + assessment_item.answers, + assessment_item.hints, + ]: + for match in re.finditer(exercise_image_filename_regex, content): + # Save files to db + self._download_file( + match.group(3), + assessment_item=assessment_item, + preset=format_presets.EXERCISE, + ) From a8e64dbda06d24af0cae76a9cc8e9fb3207d0371 Mon Sep 17 00:00:00 2001 From: Blaine Jester Date: Tue, 6 May 2025 13:04:08 -0700 Subject: [PATCH 2/8] Update documentation and utilities for always running nginx in front of devserver --- .run/devserver.run.xml | 2 +- Makefile | 45 +++++-- contentcuration/contentcuration/models.py | 42 +----- docker-compose.yml | 16 +-- docs/_index.md | 5 +- docs/host_services_setup.md | 42 +----- docs/local_dev.md | 157 ++++++++++++++++++++++ docs/local_dev_docker.md | 93 ------------- docs/local_dev_host.md | 146 -------------------- docs/local_dev_wsl.md | 117 ++-------------- package.json | 12 -- webpack.config.js | 34 +++-- 12 files changed, 240 insertions(+), 471 deletions(-) create mode 100644 docs/local_dev.md delete mode 100644 docs/local_dev_docker.md delete mode 100644 docs/local_dev_host.md diff --git a/.run/devserver.run.xml b/.run/devserver.run.xml index 1c94ee6402..55b6546404 100644 --- a/.run/devserver.run.xml +++ b/.run/devserver.run.xml @@ -13,7 +13,7 @@