diff --git a/src/sentry/features/temporary.py b/src/sentry/features/temporary.py
index e57223c651cb70..770d30ce8363b5 100644
--- a/src/sentry/features/temporary.py
+++ b/src/sentry/features/temporary.py
@@ -348,6 +348,8 @@ def register_temporary_features(manager: FeatureManager) -> None:
     manager.add("organizations:release-comparison-performance", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
     # Enable replay AI summaries
     manager.add("organizations:replay-ai-summaries", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
+    # Enable replay summary log parsing via Seer RPC
+    manager.add("organizations:replay-ai-summaries-rpc", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE)
     # Enable replay list selection
     manager.add("organizations:replay-list-select", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
     # Enable version 2 of release serializer
diff --git a/src/sentry/replays/blueprints/api.md b/src/sentry/replays/blueprints/api.md
index fd9664806456f6..3679cd7d15c89b 100644
--- a/src/sentry/replays/blueprints/api.md
+++ b/src/sentry/replays/blueprints/api.md
@@ -581,13 +581,7 @@ A POST request is issued with no body. The URL and authorization context is used

 - Response 204

-## Replay Summarize [/projects///replays//summarize/]
-
-- Parameters
-  - start (optional, string) - ISO 8601 format (`YYYY-MM-DDTHH:mm:ss.sssZ`).
-  - end (optional, string) - ISO 8601 format. Required if `start` is set.
-
-`start` and `end` default to the last 90 days. If the replay is not found in the specified time range, this endpoint will 404.
+## Replay Summary [/projects///replays//summarize/]

 ### Fetch Replay Summary Task State [GET]
diff --git a/src/sentry/replays/endpoints/project_replay_summary.py b/src/sentry/replays/endpoints/project_replay_summary.py
index 97d683e33294de..2392685adbbc63 100644
--- a/src/sentry/replays/endpoints/project_replay_summary.py
+++ b/src/sentry/replays/endpoints/project_replay_summary.py
@@ -12,6 +12,7 @@
 from sentry.api.api_publish_status import ApiPublishStatus
 from sentry.api.base import region_silo_endpoint
 from sentry.api.bases.project import ProjectEndpoint, ProjectPermission
+from sentry.api.utils import default_start_end_dates
 from sentry.models.project import Project
 from sentry.net.http import connection_from_url
 from sentry.replays.lib.storage import storage
@@ -196,6 +197,37 @@ def post(self, request: Request, project: Project, replay_id: str) -> Response:
         )
         num_segments = MAX_SEGMENTS_TO_SUMMARIZE

+        if features.has(
+            "organizations:replay-ai-summaries-rpc", project.organization, actor=request.user
+        ):
+            start, end = default_start_end_dates()
+            snuba_response = query_replay_instance(
+                project_id=project.id,
+                replay_id=replay_id,
+                start=start,
+                end=end,
+                organization=project.organization,
+                request_user_id=request.user.id,
+            )
+            if not snuba_response:
+                return self.respond(
+                    {"detail": "Replay not found."},
+                    status=404,
+                )
+
+            return self.make_seer_request(
+                SEER_START_TASK_ENDPOINT_PATH,
+                {
+                    "logs": [],
+                    "use_rpc": True,
+                    "num_segments": num_segments,
+                    "replay_id": replay_id,
+                    "organization_id": project.organization.id,
+                    "project_id": project.id,
+                    "temperature": temperature,
+                },
+            )
+
         # Fetch the replay's error and trace IDs from the replay_id.
         snuba_response = query_replay_instance(
             project_id=project.id,
diff --git a/src/sentry/replays/usecases/summarize.py b/src/sentry/replays/usecases/summarize.py
index 01d0dc61977409..4a70f5617134e9 100644
--- a/src/sentry/replays/usecases/summarize.py
+++ b/src/sentry/replays/usecases/summarize.py
@@ -7,19 +7,22 @@
 import sentry_sdk

 from sentry import nodestore
+from sentry.api.utils import default_start_end_dates
 from sentry.constants import ObjectStatus
 from sentry.issues.grouptype import FeedbackGroup
 from sentry.models.project import Project
-from sentry.replays.query import query_trace_connected_events
+from sentry.replays.post_process import process_raw_response
+from sentry.replays.query import query_replay_instance, query_trace_connected_events
 from sentry.replays.usecases.ingest.event_parser import EventType
 from sentry.replays.usecases.ingest.event_parser import (
     get_timestamp_ms as get_replay_event_timestamp_ms,
 )
 from sentry.replays.usecases.ingest.event_parser import parse_network_content_lengths, which
+from sentry.replays.usecases.reader import fetch_segments_metadata, iter_segment_data
 from sentry.search.events.types import SnubaParams
 from sentry.services.eventstore.models import Event
 from sentry.snuba.referrer import Referrer
-from sentry.utils import json
+from sentry.utils import json, metrics

 logger = logging.getLogger(__name__)
@@ -449,3 +452,78 @@ def _parse_url(s: str, trunc_length: int) -> str:
     if len(s) > trunc_length:
         return s[:trunc_length] + " [truncated]"
     return s
+
+
+def rpc_get_replay_summary_logs(
+    project_id: int, replay_id: str, num_segments: int
+) -> dict[str, Any]:
+    """
+    RPC call for Seer. Downloads a replay's segment data, queries associated errors, and parses them into summary logs.
+    """
+
+    project = Project.objects.get(id=project_id)
+    # Last 90 days. We don't support date filters in /summarize/.
+    start, end = default_start_end_dates()
+
+    # Fetch the replay's error and trace IDs from the replay_id.
+    snuba_response = query_replay_instance(
+        project_id=project.id,
+        replay_id=replay_id,
+        start=start,
+        end=end,
+        organization=project.organization,
+        request_user_id=None,  # This is for the viewed_by_me field, which is unused for summaries.
+    )
+    processed_response = process_raw_response(
+        snuba_response,
+        fields=[],  # Defaults to all fields.
+    )
+
+    # 404s should be handled in the originating Sentry endpoint.
+    # If the replay is missing here, just return an empty response.
+    if not processed_response:
+        return {"logs": []}
+
+    error_ids = processed_response[0].get("error_ids", [])
+    trace_ids = processed_response[0].get("trace_ids", [])
+
+    # Fetch same-trace errors.
+    trace_connected_errors = fetch_trace_connected_errors(
+        project=project,
+        trace_ids=trace_ids,
+        start=start,
+        end=end,
+        limit=100,
+    )
+    trace_connected_error_ids = {x["id"] for x in trace_connected_errors}
+
+    # Fetch directly linked errors, if they weren't returned by the trace query.
+    direct_errors = fetch_error_details(
+        project_id=project.id,
+        error_ids=[x for x in error_ids if x not in trace_connected_error_ids],
+    )
+
+    error_events = direct_errors + trace_connected_errors
+
+    # Metric names kept for backwards compatibility.
+    metrics.distribution(
+        "replays.endpoints.project_replay_summary.direct_errors",
+        value=len(direct_errors),
+    )
+    metrics.distribution(
+        "replays.endpoints.project_replay_summary.trace_connected_errors",
+        value=len(trace_connected_errors),
+    )
+    metrics.distribution(
+        "replays.endpoints.project_replay_summary.num_trace_ids",
+        value=len(trace_ids),
+    )
+
+    # Download segment data.
+    segment_md = fetch_segments_metadata(project.id, replay_id, 0, num_segments)
+    segment_data = iter_segment_data(segment_md)
+
+    # Combine replay and error data and parse into logs.
+    logs = get_summary_logs(segment_data, error_events, project.id)
+
+    return {"logs": logs}
diff --git a/src/sentry/seer/endpoints/seer_rpc.py b/src/sentry/seer/endpoints/seer_rpc.py
index ec8bdf64dc60e0..bb02e4a7350998 100644
--- a/src/sentry/seer/endpoints/seer_rpc.py
+++ b/src/sentry/seer/endpoints/seer_rpc.py
@@ -55,6 +55,7 @@
 from sentry.integrations.types import IntegrationProviderSlug
 from sentry.models.organization import Organization, OrganizationStatus
 from sentry.models.repository import Repository
+from sentry.replays.usecases.summarize import rpc_get_replay_summary_logs
 from sentry.search.eap.resolver import SearchResolver
 from sentry.search.eap.spans.definitions import SPAN_DEFINITIONS
 from sentry.search.eap.types import SearchResolverConfig, SupportedTraceItemType
@@ -931,6 +932,9 @@ def send_seer_webhook(*, event_name: str, organization_id: int, payload: dict) -
     "get_trace_for_transaction": rpc_get_trace_for_transaction,
     "get_profiles_for_trace": rpc_get_profiles_for_trace,
     "get_issues_for_transaction": rpc_get_issues_for_transaction,
+    #
+    # Replays
+    "get_replay_summary_logs": rpc_get_replay_summary_logs,
 }
diff --git a/tests/sentry/replays/usecases/test_summarize.py b/tests/sentry/replays/usecases/test_summarize.py
index e63e5f164b6d96..af763a0088ea55 100644
--- a/tests/sentry/replays/usecases/test_summarize.py
+++ b/tests/sentry/replays/usecases/test_summarize.py
@@ -1,16 +1,28 @@
+import uuid
+import zlib
 from collections.abc import Generator
+from datetime import UTC, datetime, timedelta
 from typing import Any
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch

 import pytest
+import requests
+from django.conf import settings

+from sentry.feedback.lib.utils import FeedbackCreationSource
+from sentry.feedback.usecases.ingest.create_feedback import create_feedback_issue
+from sentry.replays.lib.storage import FilestoreBlob, RecordingSegmentStorageMeta
+from sentry.replays.testutils import mock_replay
 from sentry.replays.usecases.ingest.event_parser import get_timestamp_unit, which
 from sentry.replays.usecases.summarize import (
     EventDict,
     _parse_iso_timestamp_to_ms,
     as_log_message,
     get_summary_logs,
+    rpc_get_replay_summary_logs,
 )
+from sentry.testutils.cases import SnubaTestCase, TransactionTestCase
+from sentry.testutils.skips import requires_snuba
 from sentry.utils import json

 """
@@ -19,7 +31,7 @@

 @patch("sentry.replays.usecases.summarize.fetch_feedback_details")
-def test_get_summary_logs(mock_fetch_feedback_details: Mock) -> None:
+def test_get_summary_logs_from_segments(mock_fetch_feedback_details: Mock) -> None:

     def _mock_fetch_feedback(feedback_id: str | None, _project_id: int) -> EventDict | None:
         if feedback_id == "12345678123456781234567812345678":
@@ -834,3 +846,409 @@ def test_parse_iso_timestamp_to_ms() -> None:
     assert _parse_iso_timestamp_to_ms("invalid timestamp") == 0.0
     assert _parse_iso_timestamp_to_ms("") == 0.0
     assert _parse_iso_timestamp_to_ms("2023-13-01T12:00:00Z") == 0.0
+
+
+@requires_snuba
+class RpcGetReplaySummaryLogsTestCase(
+    TransactionTestCase,
+    SnubaTestCase,
+):
+    def setUp(self) -> None:
+        super().setUp()
+        self.replay_id = uuid.uuid4().hex
+
+    def store_replay(self, dt: datetime | None = None, **kwargs: Any) -> None:
+        replay = mock_replay(dt or datetime.now(UTC), self.project.id, self.replay_id, **kwargs)
+        response = requests.post(
+            settings.SENTRY_SNUBA + "/tests/entities/replays/insert", json=[replay]
+        )
+        assert response.status_code == 200
+
+    def save_recording_segment(
+        self, segment_id: int, data: bytes, compressed: bool = True, is_archived: bool = False
+    ) -> None:
+        metadata = RecordingSegmentStorageMeta(
+            project_id=self.project.id,
+            replay_id=self.replay_id,
+            segment_id=segment_id,
+            retention_days=30,
+            file_id=None,
+        )
+        FilestoreBlob().set(metadata, zlib.compress(data) if compressed else data)
+
+    def test_rpc_simple(self) -> None:
+        data = [
+            {
+                "type": 5,
+                "timestamp": 0.0,
+                "data": {
+                    "tag": "breadcrumb",
+                    "payload": {"category": "console", "message": "hello"},
+                },
+            },
+            {
+                "type": 5,
+                "timestamp": 0.0,
+                "data": {
+                    "tag": "breadcrumb",
+                    "payload": {"category": "console", "message": "world"},
+                },
+            },
+        ]
+        self.save_recording_segment(0, json.dumps(data).encode())
+        self.save_recording_segment(1, json.dumps([]).encode())
+        self.store_replay()
+
+        response = rpc_get_replay_summary_logs(
+            self.project.id,
+            self.replay_id,
+            2,
+        )
+
+        assert response == {"logs": ["Logged: 'hello' at 0.0", "Logged: 'world' at 0.0"]}
+
+    def test_rpc_with_both_direct_and_trace_connected_errors(self) -> None:
+        """Test handling of breadcrumbs with both direct and trace-connected errors. Error logs should not be duplicated."""
+        now = datetime.now(UTC)
+        trace_id = uuid.uuid4().hex
+        span_id = "1" + uuid.uuid4().hex[:15]
+
+        # Create a direct error event that is not trace-connected.
+        direct_event_id = uuid.uuid4().hex
+        direct_error_timestamp = now.timestamp() - 2
+        self.store_event(
+            data={
+                "event_id": direct_event_id,
+                "timestamp": direct_error_timestamp,
+                "exception": {
+                    "values": [
+                        {
+                            "type": "ZeroDivisionError",
+                            "value": "division by zero",
+                        }
+                    ]
+                },
+                "contexts": {
+                    "replay": {"replay_id": self.replay_id},
+                    "trace": {
+                        "type": "trace",
+                        "trace_id": uuid.uuid4().hex,
+                        "span_id": span_id,
+                    },
+                },
+            },
+            project_id=self.project.id,
+        )
+
+        # Create a trace-connected error event.
+        connected_event_id = uuid.uuid4().hex
+        connected_error_timestamp = now.timestamp() - 1
+        project_2 = self.create_project()
+        self.store_event(
+            data={
+                "event_id": connected_event_id,
+                "timestamp": connected_error_timestamp,
+                "exception": {
+                    "values": [
+                        {
+                            "type": "ConnectionError",
+                            "value": "Failed to connect to database",
+                        }
+                    ]
+                },
+                "contexts": {
+                    "trace": {
+                        "type": "trace",
+                        "trace_id": trace_id,
+                        "span_id": span_id,
+                    }
+                },
+            },
+            project_id=project_2.id,
+        )
+
+        # Store the replay with both error IDs and trace IDs.
+        self.store_replay(
+            error_ids=[direct_event_id],
+            trace_ids=[trace_id],
+        )
+
+        data = [
+            {
+                "type": 5,
+                "timestamp": float(now.timestamp()),
+                "data": {
+                    "tag": "breadcrumb",
+                    "payload": {"category": "console", "message": "hello"},
+                },
+            }
+        ]
+        self.save_recording_segment(0, json.dumps(data).encode())
+
+        response = rpc_get_replay_summary_logs(
+            self.project.id,
+            self.replay_id,
+            1,
+        )
+
+        logs = response["logs"]
+        assert len(logs) == 3
+        assert any("ZeroDivisionError" in log for log in logs)
+        assert any("division by zero" in log for log in logs)
+        assert any("ConnectionError" in log for log in logs)
+        assert any("Failed to connect to database" in log for log in logs)
+
+    def test_rpc_with_feedback_breadcrumb(self) -> None:
+        """Test handling of a feedback breadcrumb when the feedback
+        is in nodestore, but hasn't reached Snuba yet.
+        If the feedback is in Snuba (guaranteed for SDK v8.0.0+),
+        it should be de-duped, as in the duplicate_feedback test below."""
+
+        now = datetime.now(UTC)
+        feedback_event_id = uuid.uuid4().hex
+
+        self.store_event(
+            data={
+                "type": "feedback",
+                "event_id": feedback_event_id,
+                "timestamp": now.timestamp(),
+                "contexts": {
+                    "feedback": {
+                        "contact_email": "josh.ferge@sentry.io",
+                        "name": "Josh Ferge",
+                        "message": "Great website!",
+                        "replay_id": self.replay_id,
+                        "url": "https://sentry.sentry.io/feedback/?statsPeriod=14d",
+                    },
+                },
+            },
+            project_id=self.project.id,
+        )
+        self.store_replay()
+
+        data = [
+            {
+                "type": 5,
+                "timestamp": float(now.timestamp()),
+                "data": {
+                    "tag": "breadcrumb",
+                    "payload": {
+                        "category": "sentry.feedback",
+                        "data": {"feedbackId": feedback_event_id},
+                    },
+                },
+            },
+        ]
+        self.save_recording_segment(0, json.dumps(data).encode())
+
+        response = rpc_get_replay_summary_logs(
+            self.project.id,
+            self.replay_id,
+            1,
+        )
+
+        logs = response["logs"]
+        assert len(logs) == 1
+        assert "User submitted feedback: 'Great website!'" in logs[0]
+
+    def test_rpc_with_trace_errors_both_datasets(self) -> None:
+        """Test that the trace-connected error Snuba query works correctly with both datasets."""
+
+        now = datetime.now(UTC)
+        project_1 = self.create_project()
+        project_2 = self.create_project()
+
+        # Create a regular error event - errors dataset.
+        event_id_1 = uuid.uuid4().hex
+        trace_id_1 = uuid.uuid4().hex
+        timestamp_1 = (now - timedelta(minutes=2)).timestamp()
+        self.store_event(
+            data={
+                "event_id": event_id_1,
+                "timestamp": timestamp_1,
+                "exception": {
+                    "values": [
+                        {
+                            "type": "ValueError",
+                            "value": "Invalid input",
+                        }
+                    ]
+                },
+                "contexts": {
+                    "trace": {
+                        "type": "trace",
+                        "trace_id": trace_id_1,
+                        "span_id": "1" + uuid.uuid4().hex[:15],
+                    }
+                },
+            },
+            project_id=project_1.id,
+        )
+
+        # Create a feedback event - issuePlatform dataset.
+        event_id_2 = uuid.uuid4().hex
+        trace_id_2 = uuid.uuid4().hex
+        timestamp_2 = (now - timedelta(minutes=5)).timestamp()
+
+        feedback_data = {
+            "type": "feedback",
+            "event_id": event_id_2,
+            "timestamp": timestamp_2,
+            "contexts": {
+                "feedback": {
+                    "contact_email": "test@example.com",
+                    "name": "Test User",
+                    "message": "Great website",
+                    "replay_id": self.replay_id,
+                    "url": "https://example.com",
+                },
+                "trace": {
+                    "type": "trace",
+                    "trace_id": trace_id_2,
+                    "span_id": "2" + uuid.uuid4().hex[:15],
+                },
+            },
+        }
+
+        create_feedback_issue(
+            feedback_data, project_2, FeedbackCreationSource.NEW_FEEDBACK_ENVELOPE
+        )
+
+        # Store the replay with all trace IDs.
+        self.store_replay(trace_ids=[trace_id_1, trace_id_2])
+
+        data = [
+            {
+                "type": 5,
+                "timestamp": 0.0,
+                "data": {
+                    "tag": "breadcrumb",
+                    "payload": {"category": "console", "message": "hello"},
+                },
+            },
+        ]
+        self.save_recording_segment(0, json.dumps(data).encode())
+
+        response = rpc_get_replay_summary_logs(
+            self.project.id,
+            self.replay_id,
+            1,
+        )
+
+        logs = response["logs"]
+        assert len(logs) == 3
+
+        # Verify that the feedback event is included.
+        assert "Great website" in logs[1]
+        assert "User submitted feedback" in logs[1]
+
+        # Verify that the regular error event is included.
+        assert "ValueError" in logs[2]
+        assert "Invalid input" in logs[2]
+        assert "User experienced an error" in logs[2]
+
+    @patch("sentry.replays.usecases.summarize.fetch_feedback_details")
+    def test_rpc_with_trace_errors_duplicate_feedback(
+        self, mock_fetch_feedback_details: MagicMock
+    ) -> None:
+        """Test that duplicate feedback events are filtered.
+        Duplicates may happen when the replay has a feedback breadcrumb,
+        and the feedback is also returned from the Snuba query for trace-connected errors."""
+
+        now = datetime.now(UTC)
+        feedback_event_id = uuid.uuid4().hex
+        feedback_event_id_2 = uuid.uuid4().hex
+        trace_id = uuid.uuid4().hex
+        trace_id_2 = uuid.uuid4().hex
+
+        # Create a feedback event - issuePlatform dataset.
+        feedback_data: dict[str, Any] = {
+            "type": "feedback",
+            "event_id": feedback_event_id,
+            "timestamp": (now - timedelta(minutes=3)).timestamp(),
+            "contexts": {
+                "feedback": {
+                    "contact_email": "test@example.com",
+                    "name": "Test User",
+                    "message": "Great website",
+                    "replay_id": self.replay_id,
+                    "url": "https://example.com",
+                },
+                "trace": {
+                    "type": "trace",
+                    "trace_id": trace_id,
+                    "span_id": "1" + uuid.uuid4().hex[:15],
+                },
+            },
+        }
+
+        # Create another feedback event - issuePlatform dataset.
+        feedback_data_2: dict[str, Any] = {
+            "type": "feedback",
+            "event_id": feedback_event_id_2,
+            "timestamp": (now - timedelta(minutes=2)).timestamp(),
+            "contexts": {
+                "feedback": {
+                    "contact_email": "test2@example.com",
+                    "name": "Test User 2",
+                    "message": "Broken website",
+                    "replay_id": self.replay_id,
+                    "url": "https://example.com",
+                },
+                "trace": {
+                    "type": "trace",
+                    "trace_id": trace_id_2,
+                    "span_id": "1" + uuid.uuid4().hex[:15],
+                },
+            },
+        }
+
+        create_feedback_issue(
+            feedback_data, self.project, FeedbackCreationSource.NEW_FEEDBACK_ENVELOPE
+        )
+        create_feedback_issue(
+            feedback_data_2, self.project, FeedbackCreationSource.NEW_FEEDBACK_ENVELOPE
+        )
+
+        self.store_replay(trace_ids=[trace_id, trace_id_2])
+
+        # Mock an SDK feedback event with the same event_id as the first feedback event.
+        data = [
+            {
+                "type": 5,
+                "timestamp": float((now - timedelta(minutes=3)).timestamp()),
+                "data": {
+                    "tag": "breadcrumb",
+                    "payload": {
+                        "category": "sentry.feedback",
+                        "data": {"feedbackId": feedback_event_id},
+                    },
+                },
+            },
+        ]
+        self.save_recording_segment(0, json.dumps(data).encode())
+
+        # Mock fetch_feedback_details to return a dup of the first feedback event.
+        # In prod this comes from nodestore. We had difficulties writing to nodestore in tests.
+        mock_fetch_feedback_details.return_value = EventDict(
+            id=feedback_event_id,
+            title="User Feedback",
+            message=feedback_data["contexts"]["feedback"]["message"],
+            timestamp=float(feedback_data["timestamp"]),
+            category="feedback",
+        )
+
+        response = rpc_get_replay_summary_logs(
+            self.project.id,
+            self.replay_id,
+            1,
+        )
+
+        logs = response["logs"]
+
+        # Verify that only the unique feedback logs are included.
+        assert len(logs) == 2
+        assert "User submitted feedback" in logs[0]
+        assert "Great website" in logs[0]
+        assert "User submitted feedback" in logs[1]
+        assert "Broken website" in logs[1]
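
Reviewer note: a minimal sketch of exercising the new RPC method locally, mirroring the tests above. It assumes a replay row in Snuba and stored recording segments already exist; `project` and `replay_id` are placeholders, not values from this PR. Seer reaches this function through the `get_replay_summary_logs` entry registered in seer_rpc.py, so a direct call exercises the same code path minus the RPC transport and auth layer:

    from sentry.replays.usecases.summarize import rpc_get_replay_summary_logs

    # Direct call, bypassing the Seer RPC transport. In the endpoint flow,
    # num_segments is set to MAX_SEGMENTS_TO_SUMMARIZE before the Seer start-task
    # request, and is presumably echoed back when Seer invokes this method.
    result = rpc_get_replay_summary_logs(
        project_id=project.id,  # placeholder: the Project owning the replay
        replay_id=replay_id,    # placeholder: hex replay ID, no dashes
        num_segments=10,
    )

    # Returns {"logs": [...]}: an empty list if the replay row isn't found,
    # since 404s are handled by the originating Sentry endpoint.
    print(result["logs"])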