@@ -8,23 +8,33 @@ Code related to permanently deleting projects.
 */

 import getLogger from "@cocalc/backend/logger";
+import { newCounter } from "@cocalc/backend/metrics";
 import getPool from "@cocalc/database/pool";
 import { getServerSettings } from "@cocalc/database/settings";
 import { callback2 } from "@cocalc/util/async-utils";
 import { KUCALC_ON_PREMISES } from "@cocalc/util/db-schema/site-defaults";
 import { minutes_ago } from "@cocalc/util/misc";
-import { bulk_delete } from "./bulk-delete";
+import { bulkDelete } from "./bulk-delete";
 import { PostgreSQL } from "./types";

 const log = getLogger("db:delete-projects");

+const delete_projects_prom = newCounter(
+  "database",
+  "delete_projects_total",
+  "Deleting projects and associated data operations counter.",
+  ["op"],
+);
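+// the "op" label records what was deleted; it is incremented below via
+// delete_projects_prom.labels("syncstring").inc() and .labels("project").inc()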
+
 /*
 Permanently delete from the database all project records where the
 project is explicitly deleted already (so the deleted field is true).
 Call this function to set up projects for permanent deletion. This blanks
 the user field so the user no longer can access the project, and we don't
 know that the user had anything to do with the project. A separate phase
 later then purges these projects from disk as well as the database.
+
+TODO: it's referenced from postgres-server-queries.coffee, but is it actually used anywhere?
 */
 export async function permanently_unlink_all_deleted_projects_of_user(
   db: PostgreSQL,
@@ -80,15 +90,24 @@ export async function unlink_old_deleted_projects(
 }

 const Q_CLEANUP_SYNCSTRINGS = `
-SELECT p.project_id, s.string_id
-FROM projects as p
-  INNER JOIN syncstrings as s
+SELECT s.string_id, p.project_id
+FROM projects as p INNER JOIN syncstrings as s
 ON p.project_id = s.project_id
 WHERE p.deleted = true
-  AND users IS NULL
-  AND p.state ->> 'state' != 'deleted'
+  AND p.users IS NULL
 ORDER BY
   p.project_id, s.string_id
+LIMIT 10000
+`;
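+// note: the LIMIT batches the work; cleanup_old_projects_data below re-runs
+// this query until it comes back empty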
+
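+// deleted projects whose associated data still needs purging, oldest first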
+const Q_CLEANUP_PROJECTS = `
+SELECT project_id
+FROM projects
+WHERE deleted = true
+  AND users IS NULL
+  AND state ->> 'state' != 'deleted'
+ORDER BY created ASC
+LIMIT 1000
 `;

 /*
@@ -110,103 +129,53 @@ export async function cleanup_old_projects_data(
   const L0 = log.extend("cleanup_old_projects_data");
   const L = L0.debug;

-  log.debug("cleanup_old_projects_data", { max_run_m, on_prem });
+  L("args", { max_run_m, on_prem });
   const start_ts = new Date();

   const pool = getPool();
-  const { rows } = await pool.query(Q_CLEANUP_SYNCSTRINGS);

-  let num = 0;
-  let pid = "";
+  let numSyncStr = 0;
+  let numProj = 0;

-  for (const row of rows) {
-    const { project_id, string_id } = row;
+  while (true) {
     if (start_ts < minutes_ago(max_run_m)) {
-      L(`too much time elapsed, breaking after ${num} syncstrings`);
-      break;
+      L(`too much time elapsed, breaking after ${numSyncStr} syncstrings`);
+      return;
     }

-    L(`deleting syncstring ${project_id}/${string_id}`);
-    num += 1;
-    await callback2(db.delete_syncstring, { string_id });
-
-    // wait a bit after deleting syncstrings, e.g. to let the standby db catch up
-    await new Promise((done) => setTimeout(done, 100));
+    const { rows: syncstrings } = await pool.query(Q_CLEANUP_SYNCSTRINGS);
+    L(`deleting ${syncstrings.length} syncstrings`);
+    for (const { project_id, string_id } of syncstrings) {
+      L(`deleting syncstring ${project_id}/${string_id}`);
+      numSyncStr += 1;
+      const t0 = Date.now();
+      await callback2(db.delete_syncstring, { string_id });
+      const elapsed_ms = Date.now() - t0;
+      delete_projects_prom.labels("syncstring").inc();
+      // wait a bit after deleting a syncstring, e.g. to let the standby db catch up:
+      // pause ~9x as long as the delete took (at most ~10% db utilization), capped at 1 second
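+      // e.g. a 50ms delete is followed by a 450ms pause; deletes over ~111ms (1000/9) hit the 1s cap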
+      await new Promise((done) =>
+        setTimeout(done, Math.min(1000, elapsed_ms * 9)),
+      );
+    }

-    // Q_CLEANUP_SYNCSTRINGS orders by project_id, hence we trigger project specific actions when the id changes
-    if (pid != project_id) {
-      pid = project_id;
+    const { rows: projects } = await pool.query(Q_CLEANUP_PROJECTS);
+    L(`deleting the data of ${projects.length} projects`);
+    for (const { project_id } of projects) {
       const L2 = L0.extend(project_id).debug;
+      delete_projects_prom.labels("project").inc();
+      numProj += 1;
+      let delRows = 0;

       if (on_prem) {
-        L2(`cleanup_old_projects_data for project_id=${project_id}`);
+        L2(`delete all project files`);
         // TODO: this only works on-prem, and requires the project files to be mounted

-        L2(`deleting all shared files in project ${project_id}`);
+        L2(`deleting all shared files`);
         // TODO: do it directly like above, and also get rid of all those shares in the database

-        const delPublicPaths = await bulk_delete({
-          table: "public_paths",
-          field: "project_id",
-          value: project_id,
-        });
-        L2(`deleted public_paths ${delPublicPaths.rowsDeleted} entries`);
-
-        const delProjectLog = await bulk_delete({
-          table: "project_log",
-          field: "project_id",
-          value: project_id,
-        });
-        L2(`deleted project_log ${delProjectLog.rowsDeleted} entries`);
-
-        const delFileUse = await bulk_delete({
-          table: "file_use",
-          field: "project_id",
-          value: project_id,
-        });
-        L2(`deleted file_use ${delFileUse.rowsDeleted} entries`);
-
-        const delAccessLog = await bulk_delete({
-          table: "file_access_log",
-          field: "project_id",
-          value: project_id,
-        });
-        L2(`deleted file_access_log ${delAccessLog.rowsDeleted} entries`);
-
-        const delJupyterApiLog = await bulk_delete({
-          table: "jupyter_api_log",
-          field: "project_id",
-          value: project_id,
-        });
-        L2(`deleted jupyter_api_log ${delJupyterApiLog.rowsDeleted} entries`);
-
-        for (const field of [
-          "target_project_id",
-          "source_project_id",
-        ] as const) {
-          const delCopyPaths = await bulk_delete({
-            table: "copy_paths",
-            field,
-            value: project_id,
-          });
-          L2(`deleted copy_paths/${field} ${delCopyPaths.rowsDeleted} entries`);
-        }
-
-        const delListings = await bulk_delete({
-          table: "listings",
-          field: "project_id",
-          id: "project_id", // TODO listings has a more complex ID, is this a problem?
-          value: project_id,
-        });
-        L2(`deleted ${delListings.rowsDeleted} listings`);
-
-        const delInviteTokens = await bulk_delete({
-          table: "project_invite_tokens",
-          field: "project_id",
-          value: project_id,
-          id: "token",
-        });
-        L2(`deleted ${delInviteTokens.rowsDeleted} entries`);
+        // for now, on-prem only as well. This gets rid of all sorts of data in tables specific to the given project.
+        delRows += await delete_associated_project_data(L2, project_id);
       }

       // now that we're done with that project, mark it as state.state ->> 'deleted'
@@ -215,6 +184,80 @@ export async function cleanup_old_projects_data(
         project_id,
         state: "deleted",
       });
+      L2(
+        `finished deleting project data | deleted ${delRows} entries | setting state.state="deleted"`,
+      );
     }
+
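+    // a full pass that found nothing left to delete means we are done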
+    if (projects.length === 0 && syncstrings.length === 0) {
+      L(`all data of deleted projects and associated syncstrings has been deleted.`);
+      L(
+        `In total, ${numSyncStr} syncstrings and ${numProj} projects were processed.`,
+      );
+      return;
+    }
+  }
+}
+
+async function delete_associated_project_data(
+  L2,
+  project_id: string,
+): Promise<number> {
+  let total = 0;
+  // these tables use the default primary key (i.e. "id"), and
+  // the field to check is always called "project_id"
+  const tables = [
+    "public_paths",
+    "project_log",
+    "file_use",
+    "file_access_log",
+    "jupyter_api_log",
+    "openai_chatgpt_log",
+  ] as const;
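+  // assumption: bulkDelete (see ./bulk-delete) removes all matching rows in
+  // smaller batches and reports the overall count as rowsDeleted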
217+
218+ for ( const table of tables ) {
219+ const { rowsDeleted } = await bulkDelete ( {
220+ table,
221+ field : "project_id" ,
222+ value : project_id ,
223+ } ) ;
224+ total += rowsDeleted ;
225+ L2 ( `deleted ${ table } ${ rowsDeleted } entries` ) ;
226+ }
+
+  // the remaining tables differ: they use another primary key, or the field holding the project_id has a different name
+
+  for (const field of ["target_project_id", "source_project_id"] as const) {
+    const { rowsDeleted } = await bulkDelete({
+      table: "copy_paths",
+      field,
+      value: project_id,
+    });
+    total += rowsDeleted;
+    L2(`deleted copy_paths/${field} ${rowsDeleted} entries`);
   }
+
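+  // the "id" option below presumably names the primary-key column bulkDelete
+  // keys its batches on, overriding the default "id" (an assumption; see ./bulk-delete)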
+  {
+    const { rowsDeleted } = await bulkDelete({
+      table: "listings",
+      field: "project_id",
+      id: "project_id", // TODO listings has a more complex ID, is this a problem?
+      value: project_id,
+    });
+    total += rowsDeleted;
+    L2(`deleted ${rowsDeleted} listings`);
+  }
+
+  {
+    const { rowsDeleted } = await bulkDelete({
+      table: "project_invite_tokens",
+      field: "project_id",
+      value: project_id,
+      id: "token",
+    });
+    total += rowsDeleted;
+    L2(`deleted ${rowsDeleted} project_invite_tokens entries`);
+  }
+
+  return total;
 }