From b48f54269c715c732bfddad1d2ed69c86810f27b Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Wed, 6 Aug 2025 20:36:26 +0200 Subject: [PATCH 1/5] Attempt to move Pageview deletion to a background task The test was failing locally with IntegrityErrors, so I'm not sure if this is a good approach or not. --- readthedocs/analytics/apps.py | 4 ++ .../analytics/migrations/0008_no_cascade.py | 39 ++++++++++++++++ readthedocs/analytics/models.py | 4 +- readthedocs/analytics/tasks/__init__.py | 0 readthedocs/analytics/tasks/pageviews.py | 16 +++++++ .../analytics/tests/test_pageview_cleanup.py | 45 +++++++++++++++++++ readthedocs/projects/models.py | 4 ++ 7 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 readthedocs/analytics/migrations/0008_no_cascade.py create mode 100644 readthedocs/analytics/tasks/__init__.py create mode 100644 readthedocs/analytics/tasks/pageviews.py create mode 100644 readthedocs/analytics/tests/test_pageview_cleanup.py diff --git a/readthedocs/analytics/apps.py b/readthedocs/analytics/apps.py index 6293f36771e..5614e0b66ca 100644 --- a/readthedocs/analytics/apps.py +++ b/readthedocs/analytics/apps.py @@ -9,3 +9,7 @@ class AnalyticsAppConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" name = "readthedocs.analytics" verbose_name = "Analytics" + + def ready(self): + # Import tasks to ensure they are registered + pass diff --git a/readthedocs/analytics/migrations/0008_no_cascade.py b/readthedocs/analytics/migrations/0008_no_cascade.py new file mode 100644 index 00000000000..783ce120087 --- /dev/null +++ b/readthedocs/analytics/migrations/0008_no_cascade.py @@ -0,0 +1,39 @@ +# Generated by Django 5.2.3 on 2025-08-06 18:28 + +import django.db.models.deletion +from django.db import migrations +from django.db import models +from django_safemigrate import Safe + + +class Migration(migrations.Migration): + dependencies = [ + ("analytics", "0007_index_on_pageview_date"), + ("builds", "0064_healthcheck"), + ("projects", "0152_create_gh_app_integration"), + ] + + safe = Safe.before_deploy() + + operations = [ + migrations.AlterField( + model_name="pageview", + name="project", + field=models.ForeignKey( + on_delete=django.db.models.deletion.DO_NOTHING, + related_name="page_views", + to="projects.project", + ), + ), + migrations.AlterField( + model_name="pageview", + name="version", + field=models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.DO_NOTHING, + related_name="page_views", + to="builds.version", + verbose_name="Version", + ), + ), + ] diff --git a/readthedocs/analytics/models.py b/readthedocs/analytics/models.py index 58813ce325e..81247892de1 100644 --- a/readthedocs/analytics/models.py +++ b/readthedocs/analytics/models.py @@ -59,7 +59,7 @@ class PageView(models.Model): project = models.ForeignKey( Project, related_name="page_views", - on_delete=models.CASCADE, + on_delete=models.DO_NOTHING, ) # NOTE: this could potentially be removed, # since isn't being used and not all page @@ -68,7 +68,7 @@ class PageView(models.Model): Version, verbose_name=_("Version"), related_name="page_views", - on_delete=models.CASCADE, + on_delete=models.DO_NOTHING, null=True, ) path = models.CharField( diff --git a/readthedocs/analytics/tasks/__init__.py b/readthedocs/analytics/tasks/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/readthedocs/analytics/tasks/pageviews.py b/readthedocs/analytics/tasks/pageviews.py new file mode 100644 index 00000000000..b6b15f97e71 --- /dev/null +++ b/readthedocs/analytics/tasks/pageviews.py @@ -0,0 +1,16 @@ +import structlog +from celery import shared_task + +from readthedocs.analytics.models import PageView + + +log = structlog.get_logger(__name__) + + +@shared_task(queue="web") +def delete_project_pageviews(project_slug): + """ + Delete all PageView objects for a given project slug. + """ + count, _ = PageView.objects.filter(project__slug=project_slug).delete() + log.info("Deleted PageViews for project", project_slug=project_slug, count=count) diff --git a/readthedocs/analytics/tests/test_pageview_cleanup.py b/readthedocs/analytics/tests/test_pageview_cleanup.py new file mode 100644 index 00000000000..487fdb5ca93 --- /dev/null +++ b/readthedocs/analytics/tests/test_pageview_cleanup.py @@ -0,0 +1,45 @@ +import pytest +from django.contrib.auth import get_user_model +from django.contrib.auth.models import User +from readthedocs.projects.models import Project +from readthedocs.analytics.models import PageView +from django.utils import timezone +from django.conf import settings +from django_dynamic_fixture import get + +@pytest.mark.django_db +def test_project_delete_triggers_pageview_cleanup(settings): + + settings.CELERY_TASK_ALWAYS_EAGER = True # Run Celery tasks eagerly + user = get(User) + project = get(Project, users=[user], slug="test-project") + project.save() + + # Create some PageViews for the project + get( + PageView, + project=project, + path="/index.html", + full_path="/en/latest/index.html", + view_count=5, + date=timezone.now().date(), + status=200, + ) + get( + PageView, + project=project, + path="/about.html", + full_path="/en/latest/about.html", + view_count=2, + date=timezone.now().date(), + status=200, + ) + + # The PageViews should be deleted by the background task + assert PageView.objects.filter(project__slug='test-project').count() == 2 + + # Delete the project + project.delete() + + # The PageViews should be deleted by the background task + assert PageView.objects.filter(project__slug='test-project').count() == 0 diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 907abf5d51d..da69335f5d9 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -669,6 +669,7 @@ def save(self, *args, **kwargs): self.update_latest_version() def delete(self, *args, **kwargs): + from readthedocs.analytics.tasks.pageviews import delete_project_pageviews from readthedocs.projects.tasks.utils import clean_project_resources # Remove extra resources @@ -676,6 +677,9 @@ def delete(self, *args, **kwargs): super().delete(*args, **kwargs) + # Trigger background task to delete PageViews for this project + delete_project_pageviews.delay(self.slug) + def clean(self): if self.custom_prefix: self.custom_prefix = validate_custom_prefix(self, self.custom_prefix) From 822ce61ddbed9227522b33a06712910f3916b23e Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Thu, 7 Aug 2025 08:31:17 +0200 Subject: [PATCH 2/5] Fix silly test mistake --- .../analytics/tests/test_pageview_cleanup.py | 100 +++++++++++------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/readthedocs/analytics/tests/test_pageview_cleanup.py b/readthedocs/analytics/tests/test_pageview_cleanup.py index 487fdb5ca93..448f1533c65 100644 --- a/readthedocs/analytics/tests/test_pageview_cleanup.py +++ b/readthedocs/analytics/tests/test_pageview_cleanup.py @@ -1,45 +1,71 @@ -import pytest from django.contrib.auth import get_user_model from django.contrib.auth.models import User +from django.test import TestCase from readthedocs.projects.models import Project from readthedocs.analytics.models import PageView from django.utils import timezone from django.conf import settings from django_dynamic_fixture import get -@pytest.mark.django_db -def test_project_delete_triggers_pageview_cleanup(settings): - - settings.CELERY_TASK_ALWAYS_EAGER = True # Run Celery tasks eagerly - user = get(User) - project = get(Project, users=[user], slug="test-project") - project.save() - - # Create some PageViews for the project - get( - PageView, - project=project, - path="/index.html", - full_path="/en/latest/index.html", - view_count=5, - date=timezone.now().date(), - status=200, - ) - get( - PageView, - project=project, - path="/about.html", - full_path="/en/latest/about.html", - view_count=2, - date=timezone.now().date(), - status=200, - ) - - # The PageViews should be deleted by the background task - assert PageView.objects.filter(project__slug='test-project').count() == 2 - - # Delete the project - project.delete() - - # The PageViews should be deleted by the background task - assert PageView.objects.filter(project__slug='test-project').count() == 0 +class TaskTests(TestCase): + + def setUp(self): + self.user = get(User) + self.project = get(Project, users=[self.user], slug="test-project") + self.project.save() + + # Create some PageViews for the project + get( + PageView, + project=self.project, + path="/index.html", + full_path="/en/latest/index.html", + view_count=5, + date=timezone.now().date(), + status=200, + ) + get( + PageView, + project=self.project, + path="/about.html", + full_path="/en/latest/about.html", + view_count=2, + date=timezone.now().date(), + status=200, + ) + + def test_pageview_cleanup(self, settings): + + settings.CELERY_TASK_ALWAYS_EAGER = True # Run Celery tasks eagerly + user = get(User) + project = get(Project, users=[user], slug="test-project") + project.save() + + # Create some PageViews for the project + get( + PageView, + project=project, + path="/index.html", + full_path="/en/latest/index.html", + view_count=5, + date=timezone.now().date(), + status=200, + ) + get( + PageView, + project=project, + path="/about.html", + full_path="/en/latest/about.html", + view_count=2, + date=timezone.now().date(), + status=200, + ) + + # The PageViews should be deleted by the background task + assert PageView.objects.filter(project__slug='test-project').count() == 2 + + # Delete the project + project.delete() + + # The PageViews should be deleted by the background task + assert PageView.objects.filter(project__slug='test-project').count() == 0 From 94fdd1524055e7f6f993a788b766b1899d3c3231 Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Thu, 7 Aug 2025 08:31:42 +0200 Subject: [PATCH 3/5] Remove hack --- readthedocs/analytics/tests/test_pageview_cleanup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/readthedocs/analytics/tests/test_pageview_cleanup.py b/readthedocs/analytics/tests/test_pageview_cleanup.py index 448f1533c65..cdbd4bede5d 100644 --- a/readthedocs/analytics/tests/test_pageview_cleanup.py +++ b/readthedocs/analytics/tests/test_pageview_cleanup.py @@ -36,7 +36,6 @@ def setUp(self): def test_pageview_cleanup(self, settings): - settings.CELERY_TASK_ALWAYS_EAGER = True # Run Celery tasks eagerly user = get(User) project = get(Project, users=[user], slug="test-project") project.save() From d2a62802c766b1535e6c3b239d43ab4f3b3c2e36 Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Thu, 7 Aug 2025 08:35:03 +0200 Subject: [PATCH 4/5] Move to project cleanup tasks --- readthedocs/analytics/apps.py | 4 ---- readthedocs/projects/models.py | 6 +----- readthedocs/projects/tasks/utils.py | 5 +++++ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/readthedocs/analytics/apps.py b/readthedocs/analytics/apps.py index 5614e0b66ca..6293f36771e 100644 --- a/readthedocs/analytics/apps.py +++ b/readthedocs/analytics/apps.py @@ -9,7 +9,3 @@ class AnalyticsAppConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" name = "readthedocs.analytics" verbose_name = "Analytics" - - def ready(self): - # Import tasks to ensure they are registered - pass diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index da69335f5d9..40cc0a76f95 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -669,17 +669,13 @@ def save(self, *args, **kwargs): self.update_latest_version() def delete(self, *args, **kwargs): - from readthedocs.analytics.tasks.pageviews import delete_project_pageviews from readthedocs.projects.tasks.utils import clean_project_resources - # Remove extra resources + # Remove HTML files, analytics data, etc. clean_project_resources(self) super().delete(*args, **kwargs) - # Trigger background task to delete PageViews for this project - delete_project_pageviews.delay(self.slug) - def clean(self): if self.custom_prefix: self.custom_prefix = validate_custom_prefix(self, self.custom_prefix) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index 9a1fa8a0184..b21f2b94063 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -9,6 +9,7 @@ from django.db.models import Q from django.utils import timezone +from readthedocs.analytics.tasks.pageviews import delete_project_pageviews from readthedocs.builds.constants import BUILD_FINAL_STATES from readthedocs.builds.constants import BUILD_STATE_CANCELLED from readthedocs.builds.constants import EXTERNAL @@ -95,6 +96,10 @@ def clean_project_resources(project, version=None, version_slug=None): else: project.imported_files.all().delete() + # Remove PageViews for this project async, + # since they can be very slow to delete. + delete_project_pageviews.delay(project.slug) + @app.task() def finish_unhealthy_builds(): From 2accaede66947a84be2a99c7cdce4904f772dab0 Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Thu, 7 Aug 2025 08:50:02 +0200 Subject: [PATCH 5/5] Test version delete too --- readthedocs/analytics/tasks/pageviews.py | 20 +++-- .../analytics/tests/test_pageview_cleanup.py | 70 ------------------ readthedocs/analytics/tests/test_tasks.py | 74 +++++++++++++++++++ readthedocs/projects/tasks/utils.py | 2 +- 4 files changed, 89 insertions(+), 77 deletions(-) delete mode 100644 readthedocs/analytics/tests/test_pageview_cleanup.py create mode 100644 readthedocs/analytics/tests/test_tasks.py diff --git a/readthedocs/analytics/tasks/pageviews.py b/readthedocs/analytics/tasks/pageviews.py index b6b15f97e71..1a9f96e410f 100644 --- a/readthedocs/analytics/tasks/pageviews.py +++ b/readthedocs/analytics/tasks/pageviews.py @@ -1,16 +1,24 @@ import structlog -from celery import shared_task from readthedocs.analytics.models import PageView +from readthedocs.worker import app log = structlog.get_logger(__name__) -@shared_task(queue="web") -def delete_project_pageviews(project_slug): +@app.task(queue="web") +def delete_project_pageviews(project_slug, version_slug=None): """ - Delete all PageView objects for a given project slug. + Delete all PageView objects for a given project slug, or a specific version if version_slug is provided. """ - count, _ = PageView.objects.filter(project__slug=project_slug).delete() - log.info("Deleted PageViews for project", project_slug=project_slug, count=count) + queryset = PageView.objects.filter(project__slug=project_slug) + if version_slug is not None: + queryset = queryset.filter(version__slug=version_slug) + count, _ = queryset.delete() + log.info( + "Deleted PageViews for project", + project_slug=project_slug, + version_slug=version_slug, + count=count, + ) diff --git a/readthedocs/analytics/tests/test_pageview_cleanup.py b/readthedocs/analytics/tests/test_pageview_cleanup.py deleted file mode 100644 index cdbd4bede5d..00000000000 --- a/readthedocs/analytics/tests/test_pageview_cleanup.py +++ /dev/null @@ -1,70 +0,0 @@ -from django.contrib.auth import get_user_model -from django.contrib.auth.models import User -from django.test import TestCase -from readthedocs.projects.models import Project -from readthedocs.analytics.models import PageView -from django.utils import timezone -from django.conf import settings -from django_dynamic_fixture import get - -class TaskTests(TestCase): - - def setUp(self): - self.user = get(User) - self.project = get(Project, users=[self.user], slug="test-project") - self.project.save() - - # Create some PageViews for the project - get( - PageView, - project=self.project, - path="/index.html", - full_path="/en/latest/index.html", - view_count=5, - date=timezone.now().date(), - status=200, - ) - get( - PageView, - project=self.project, - path="/about.html", - full_path="/en/latest/about.html", - view_count=2, - date=timezone.now().date(), - status=200, - ) - - def test_pageview_cleanup(self, settings): - - user = get(User) - project = get(Project, users=[user], slug="test-project") - project.save() - - # Create some PageViews for the project - get( - PageView, - project=project, - path="/index.html", - full_path="/en/latest/index.html", - view_count=5, - date=timezone.now().date(), - status=200, - ) - get( - PageView, - project=project, - path="/about.html", - full_path="/en/latest/about.html", - view_count=2, - date=timezone.now().date(), - status=200, - ) - - # The PageViews should be deleted by the background task - assert PageView.objects.filter(project__slug='test-project').count() == 2 - - # Delete the project - project.delete() - - # The PageViews should be deleted by the background task - assert PageView.objects.filter(project__slug='test-project').count() == 0 diff --git a/readthedocs/analytics/tests/test_tasks.py b/readthedocs/analytics/tests/test_tasks.py new file mode 100644 index 00000000000..89d7efb02d8 --- /dev/null +++ b/readthedocs/analytics/tests/test_tasks.py @@ -0,0 +1,74 @@ +from django.contrib.auth import get_user_model +from django.contrib.auth.models import User +from django.test import TestCase +from readthedocs.projects.models import Project +from readthedocs.builds.models import Version +from readthedocs.analytics.models import PageView +from django.utils import timezone +from django.conf import settings +from django_dynamic_fixture import get + +class PageViewTaskTests(TestCase): + + def setUp(self): + self.user = get(User) + self.project = get(Project, users=[self.user], slug="test-project") + self.project.save() + + # Create some PageViews for the project + get( + PageView, + project=self.project, + path="/index.html", + full_path="/en/latest/index.html", + view_count=5, + date=timezone.now().date(), + status=200, + ) + get( + PageView, + project=self.project, + path="/about.html", + full_path="/en/latest/about.html", + view_count=2, + date=timezone.now().date(), + status=200, + ) + + def test_pageview_cleanup(self): + # The PageViews should be present initially + self.assertEqual(PageView.objects.filter(project__slug='test-project').count(), 2) + + # Delete the project + self.project.delete() + + # The PageViews should be deleted by the background task + self.assertEqual(PageView.objects.filter(project__slug='test-project').count(), 0) + + def test_pageview_cleanup_on_version_delete(self): + # Create a version for the project + version = get(Version, project=self.project, slug="v1.0", verbose_name="v1.0") + version.save() + + # Create PageViews for this version + get( + PageView, + project=self.project, + version=version, + path="/v1.0/index.html", + full_path="/en/v1.0/index.html", + view_count=3, + date=timezone.now().date(), + status=200, + ) + + # There should be 3 PageViews now (2 from setUp, 1 for version) + self.assertEqual(PageView.objects.filter(project__slug='test-project').count(), 3) + self.assertEqual(PageView.objects.filter(version__slug=version.slug).count(), 1) + + # Delete the version + version.delete() + + # The PageView for this version should be deleted, others remain + self.assertEqual(PageView.objects.filter(project__slug='test-project').count(), 2) + self.assertEqual(PageView.objects.filter(version__slug=version.slug).count(), 0) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index b21f2b94063..5e90438353c 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -98,7 +98,7 @@ def clean_project_resources(project, version=None, version_slug=None): # Remove PageViews for this project async, # since they can be very slow to delete. - delete_project_pageviews.delay(project.slug) + delete_project_pageviews.delay(project_slug=project.slug, version_slug=version_slug) @app.task()