From e19fedb304f7333197010c07ed6c1dd5be1661fe Mon Sep 17 00:00:00 2001 From: ozer550 Date: Thu, 11 Dec 2025 12:07:42 +0530 Subject: [PATCH 1/3] remove perseus from storage calculations --- contentcuration/contentcuration/models.py | 43 +++++++++------ .../contentcuration/tests/test_files.py | 55 +++++++++++++++++++ 2 files changed, 82 insertions(+), 16 deletions(-) diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index 32727f0159..dfef8df461 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -349,20 +349,20 @@ def check_feature_flag(self, flag_name): def check_channel_space(self, channel): tree_cte = With(self.get_user_active_trees().distinct(), name="trees") - files_cte = With( - tree_cte.join( - self.files.get_queryset(), contentnode__tree_id=tree_cte.col.tree_id - ) - .values("checksum") - .distinct(), - name="files", + channel_files_qs = tree_cte.join( + self.files.get_queryset(), contentnode__tree_id=tree_cte.col.tree_id ) + channel_files_qs = self._filter_storage_billable_files(channel_files_qs) + files_cte = With(channel_files_qs.values("checksum").distinct(), name="files") - staging_tree_files = ( + staging_tree_files = self._filter_storage_billable_files( self.files.filter(contentnode__tree_id=channel.staging_tree.tree_id) - .with_cte(tree_cte) - .with_cte(files_cte) - .exclude(Exists(files_cte.queryset().filter(checksum=OuterRef("checksum")))) + ) + staging_tree_files = staging_tree_files.with_cte(tree_cte).with_cte(files_cte) + staging_tree_files = ( + staging_tree_files.exclude( + Exists(files_cte.queryset().filter(checksum=OuterRef("checksum"))) + ) .values("checksum") .distinct() ) @@ -412,11 +412,22 @@ def get_user_active_trees(self): def get_user_active_files(self): cte = With(self.get_user_active_trees().distinct()) - return ( - cte.join(self.files.get_queryset(), contentnode__tree_id=cte.col.tree_id) - .with_cte(cte) - .values("checksum") - .distinct() + files_qs = cte.join( + self.files.get_queryset(), contentnode__tree_id=cte.col.tree_id + ).with_cte(cte) + + files_qs = self._filter_storage_billable_files(files_qs) + + return files_qs.values("checksum").distinct() + + def _filter_storage_billable_files(self, queryset): + """ + Perseus exports would not be included in storage calculations. + """ + if queryset is None: + return queryset + return queryset.exclude(file_format_id__isnull=True).exclude( + file_format_id=file_formats.PERSEUS ) def get_space_used(self, active_files=None): diff --git a/contentcuration/contentcuration/tests/test_files.py b/contentcuration/contentcuration/tests/test_files.py index 125dcbc48b..7e0b11a573 100755 --- a/contentcuration/contentcuration/tests/test_files.py +++ b/contentcuration/contentcuration/tests/test_files.py @@ -9,6 +9,7 @@ from django.db.models import Exists from django.db.models import OuterRef from le_utils.constants import content_kinds +from le_utils.constants import file_formats from mock import patch from .base import BaseAPITestCase @@ -232,3 +233,57 @@ def test_check_staged_space__exists(self): ) as get_available_staged_space: get_available_staged_space.return_value = 0 self.assertTrue(self.user.check_staged_space(100, f.checksum)) + + def test_check_channel_space_ignores_perseus_exports(self): + with mock.patch("contentcuration.utils.user.calculate_user_storage"): + self.node_file.file_format_id = file_formats.PERSEUS + self.node_file.file_size = self.user.disk_space + 1 + self.node_file.checksum = uuid4().hex + self.node_file.uploaded_by = self.user + self.node_file.save(set_by_file_on_disk=False) + + try: + self.user.check_channel_space(self.staged_channel) + except PermissionDenied: + self.fail("Perseus exports should not count against staging space") + + +class UserStorageUsageTestCase(StudioTestCase): + def setUp(self): + super().setUpBase() + self.contentnode = ( + self.channel.main_tree.get_descendants(include_self=True) + .filter(files__isnull=False) + .first() + ) + self.assertIsNotNone(self.contentnode) + self.base_file = self.contentnode.files.first() + self.assertIsNotNone(self.base_file) + + def _create_file(self, *, file_format, size): + file_record = File( + contentnode=self.contentnode, + checksum=uuid4().hex, + file_format_id=file_format, + file_size=size, + uploaded_by=self.user, + ) + file_record.save(set_by_file_on_disk=False) + return file_record + + def test_get_space_used_excludes_perseus_exports(self): + baseline_usage = self.user.get_space_used() + + perseus_size = 125 + with mock.patch("contentcuration.utils.user.calculate_user_storage"): + self._create_file(file_format=file_formats.PERSEUS, size=perseus_size) + self.assertEqual(self.user.get_space_used(), baseline_usage) + + non_perseus_size = 275 + with mock.patch("contentcuration.utils.user.calculate_user_storage"): + self._create_file( + file_format=self.base_file.file_format_id, size=non_perseus_size + ) + + expected_usage = baseline_usage + non_perseus_size + self.assertEqual(self.user.get_space_used(), expected_usage) From eaba50497ff882a063845c6cf10e07e1935579ef Mon Sep 17 00:00:00 2001 From: ozer550 Date: Fri, 19 Dec 2025 14:27:55 +0530 Subject: [PATCH 2/3] optimise query by shuffling cte order --- contentcuration/contentcuration/models.py | 84 ++++++++++++++++++----- 1 file changed, 67 insertions(+), 17 deletions(-) diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index dfef8df461..23adf07846 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -349,25 +349,57 @@ def check_feature_flag(self, flag_name): def check_channel_space(self, channel): tree_cte = With(self.get_user_active_trees().distinct(), name="trees") - channel_files_qs = tree_cte.join( - self.files.get_queryset(), contentnode__tree_id=tree_cte.col.tree_id + + user_files_cte = With( + self.files.get_queryset() + .values("checksum", "contentnode_id", "file_format_id") + .distinct(), + name="user_files", + ) + + editable_files_qs = ( + user_files_cte.queryset() + .with_cte(tree_cte) + .with_cte(user_files_cte) + .filter( + Exists( + tree_cte.join( + ContentNode.objects.all(), tree_id=tree_cte.col.tree_id + ) + .with_cte(tree_cte) + .filter(id=OuterRef("contentnode_id")) + ) + ) + ) + + editable_files_qs = self._filter_storage_billable_files(editable_files_qs) + + existing_checksums_cte = With( + editable_files_qs.values("checksum").distinct(), name="existing_checksums" ) - channel_files_qs = self._filter_storage_billable_files(channel_files_qs) - files_cte = With(channel_files_qs.values("checksum").distinct(), name="files") - staging_tree_files = self._filter_storage_billable_files( + staging_files_qs = self._filter_storage_billable_files( self.files.filter(contentnode__tree_id=channel.staging_tree.tree_id) ) - staging_tree_files = staging_tree_files.with_cte(tree_cte).with_cte(files_cte) - staging_tree_files = ( - staging_tree_files.exclude( - Exists(files_cte.queryset().filter(checksum=OuterRef("checksum"))) + + staging_files_qs = ( + staging_files_qs.with_cte(tree_cte) + .with_cte(user_files_cte) + .with_cte(existing_checksums_cte) + .exclude( + Exists( + existing_checksums_cte.queryset().filter( + checksum=OuterRef("checksum") + ) + ) ) - .values("checksum") - .distinct() ) + staged_size = float( - staging_tree_files.aggregate(used=Sum("file_size"))["used"] or 0 + staging_files_qs.values("checksum") + .distinct() + .aggregate(used=Sum("file_size"))["used"] + or 0 ) if self.get_available_space() < staged_size: @@ -410,13 +442,31 @@ def get_user_active_trees(self): ) def get_user_active_files(self): - cte = With(self.get_user_active_trees().distinct()) - files_qs = cte.join( - self.files.get_queryset(), contentnode__tree_id=cte.col.tree_id - ).with_cte(cte) + tree_cte = With(self.get_user_active_trees().distinct(), name="trees") + + user_files_cte = With( + self.files.get_queryset() + .values("checksum", "contentnode_id", "file_format_id") + .distinct(), + name="user_files", + ) + file_qs = ( + user_files_cte.queryset() + .with_cte(tree_cte) + .with_cte(user_files_cte) + .filter( + Exists( + tree_cte.join( + ContentNode.objects.all(), tree_id=tree_cte.col.tree_id + ) + .with_cte(tree_cte) + .filter(id=OuterRef("contentnode_id")) + ) + ) + ) - files_qs = self._filter_storage_billable_files(files_qs) + files_qs = self._filter_storage_billable_files(file_qs) return files_qs.values("checksum").distinct() From d509c379507abf6b92d3ea4a05b721d59414b0c3 Mon Sep 17 00:00:00 2001 From: ozer550 Date: Thu, 1 Jan 2026 09:48:45 +0530 Subject: [PATCH 3/3] fix failing tests --- contentcuration/contentcuration/models.py | 52 +++++++++++++++++------ 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index 23adf07846..bdd9ccedca 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -351,9 +351,14 @@ def check_channel_space(self, channel): tree_cte = With(self.get_user_active_trees().distinct(), name="trees") user_files_cte = With( - self.files.get_queryset() - .values("checksum", "contentnode_id", "file_format_id") - .distinct(), + self.files.get_queryset().only( + "id", + "checksum", + "contentnode_id", + "file_format_id", + "file_size", + "preset_id", + ), name="user_files", ) @@ -364,7 +369,8 @@ def check_channel_space(self, channel): .filter( Exists( tree_cte.join( - ContentNode.objects.all(), tree_id=tree_cte.col.tree_id + ContentNode.objects.only("id", "tree_id"), + tree_id=tree_cte.col.tree_id, ) .with_cte(tree_cte) .filter(id=OuterRef("contentnode_id")) @@ -395,10 +401,15 @@ def check_channel_space(self, channel): ) ) + unique_staging_ids = ( + staging_files_qs.order_by("checksum", "id") + .distinct("checksum") + .values("id") + ) staged_size = float( - staging_files_qs.values("checksum") - .distinct() - .aggregate(used=Sum("file_size"))["used"] + staging_files_qs.filter(id__in=Subquery(unique_staging_ids)).aggregate( + used=Sum("file_size") + )["used"] or 0 ) @@ -446,19 +457,26 @@ def get_user_active_files(self): tree_cte = With(self.get_user_active_trees().distinct(), name="trees") user_files_cte = With( - self.files.get_queryset() - .values("checksum", "contentnode_id", "file_format_id") - .distinct(), + self.files.get_queryset().only( + "id", + "checksum", + "contentnode_id", + "file_format_id", + "file_size", + "preset_id", + ), name="user_files", ) - file_qs = ( + + base_files_qs = ( user_files_cte.queryset() .with_cte(tree_cte) .with_cte(user_files_cte) .filter( Exists( tree_cte.join( - ContentNode.objects.all(), tree_id=tree_cte.col.tree_id + ContentNode.objects.only("id", "tree_id"), + tree_id=tree_cte.col.tree_id, ) .with_cte(tree_cte) .filter(id=OuterRef("contentnode_id")) @@ -466,9 +484,15 @@ def get_user_active_files(self): ) ) - files_qs = self._filter_storage_billable_files(file_qs) + base_files_qs = self._filter_storage_billable_files(base_files_qs) + + unique_file_ids = ( + base_files_qs.order_by("checksum", "id").distinct("checksum").values("id") + ) + + files_qs = base_files_qs.filter(id__in=Subquery(unique_file_ids)) - return files_qs.values("checksum").distinct() + return files_qs def _filter_storage_billable_files(self, queryset): """