
Commit ee619d3

vanja-vujovic-db authored and cloud-fan committed
[SPARK-53260][SQL] Reducing number of JDBC overhead connections creation
### What changes were proposed in this pull request?

JDBC connectors open more connections to remote engines than needed: logging shows that 2-4 connections are opened while running a single sql(...) command. The two connections that are always opened both come from JDBCTableCatalog, so connection sharing between those two is introduced.

### Why are the changes needed?

To reduce query time.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Locally.

### Was this patch authored or co-authored using generative AI tooling?

Closes #51991 from vanja-vujovic-db/connections.

Authored-by: vanja-vujovic-db <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent: 967f2b6
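The mechanism behind the change is a loan pattern: open one java.sql.Connection, lend it to every operation that needs one, and close it afterwards. Below is a minimal, self-contained Scala sketch of that pattern. It mirrors what JdbcUtils.withConnection is used for in the diffs below, but the helper bodies, the metadata-based table check, and the H2 URL are illustrative assumptions, not Spark code.

```scala
import java.sql.{Connection, DriverManager}
import scala.collection.mutable.ListBuffer

object ConnectionSharingSketch {
  // Loan pattern: open one connection, lend it to `body`, always close it.
  def withConnection[T](url: String)(body: Connection => T): T = {
    val conn = DriverManager.getConnection(url)
    try body(conn) finally conn.close()
  }

  // Illustrative existence check via JDBC metadata (not Spark's implementation).
  def tableExists(conn: Connection, table: String): Boolean = {
    val rs = conn.getMetaData.getTables(null, null, table, null)
    try rs.next() finally rs.close()
  }

  // Illustrative schema fetch: column names via JDBC metadata.
  def fetchColumns(conn: Connection, table: String): Seq[String] = {
    val rs = conn.getMetaData.getColumns(null, null, table, null)
    val cols = ListBuffer.empty[String]
    try { while (rs.next()) cols += rs.getString("COLUMN_NAME") } finally rs.close()
    cols.toList
  }

  def main(args: Array[String]): Unit = {
    // Before this commit, the existence check and the schema fetch each opened
    // their own connection; lending one connection to both halves the count.
    withConnection("jdbc:h2:mem:demo") { conn => // assumes an H2 driver on the classpath
      if (tableExists(conn, "PEOPLE")) println(fetchColumns(conn, "PEOPLE"))
    }
  }
}
```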

File tree

2 files changed (+57, -42 lines)

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala

Lines changed: 22 additions & 11 deletions
@@ -54,15 +54,15 @@ object JDBCRDD extends Logging {
    * @throws java.sql.SQLException if the table specification is garbage.
    * @throws java.sql.SQLException if the table contains an unsupported type.
    */
-  def resolveTable(options: JDBCOptions): StructType = {
+  def resolveTable(options: JDBCOptions, conn: Connection): StructType = {
     val url = options.url
     val prepareQuery = options.prepareQuery
     val table = options.tableOrQuery
     val dialect = JdbcDialects.get(url)
     val fullQuery = prepareQuery + dialect.getSchemaQuery(table)

     try {
-      getQueryOutputSchema(fullQuery, options, dialect)
+      getQueryOutputSchema(fullQuery, options, dialect, conn)
     } catch {
       case e: SQLException if dialect.isSyntaxErrorBestEffort(e) =>
         throw new SparkException(
@@ -72,20 +72,31 @@ object JDBCRDD extends Logging {
     }
   }

+  def resolveTable(options: JDBCOptions): StructType = {
+    JdbcUtils.withConnection(options) {
+      resolveTable(options, _)
+    }
+  }
+
   def getQueryOutputSchema(
-      query: String, options: JDBCOptions, dialect: JdbcDialect): StructType = {
-    Using.resource(dialect.createConnectionFactory(options)(-1)) { conn =>
-      logInfo(log"Generated JDBC query to get scan output schema: ${MDC(SQL_TEXT, query)}")
-      Using.resource(conn.prepareStatement(query)) { statement =>
-        statement.setQueryTimeout(options.queryTimeout)
-        Using.resource(statement.executeQuery()) { rs =>
-          JdbcUtils.getSchema(conn, rs, dialect, alwaysNullable = true,
-            isTimestampNTZ = options.preferTimestampNTZ)
-        }
+      query: String, options: JDBCOptions, dialect: JdbcDialect, conn: Connection): StructType = {
+    logInfo(log"Generated JDBC query to get scan output schema: ${MDC(SQL_TEXT, query)}")
+    Using.resource(conn.prepareStatement(query)) { statement =>
+      statement.setQueryTimeout(options.queryTimeout)
+      Using.resource(statement.executeQuery()) { rs =>
+        JdbcUtils.getSchema(conn, rs, dialect, alwaysNullable = true,
+          isTimestampNTZ = options.preferTimestampNTZ)
       }
     }
   }

+  def getQueryOutputSchema(
+      query: String, options: JDBCOptions, dialect: JdbcDialect): StructType = {
+    JdbcUtils.withConnection(options) {
+      getQueryOutputSchema(query, options, dialect, _)
+    }
+  }
+
   /**
    * Prune all but the specified columns from the specified Catalyst schema.
    *
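Both schema-resolution entry points now come in two flavors: the new overloads take an explicit Connection, while the original signatures remain as thin wrappers that borrow one via JdbcUtils.withConnection, so existing callers compile unchanged. A hedged usage sketch, assuming options, query, and dialect are already in scope:

```scala
// Old-style call site: unchanged, still opens and closes its own connection.
val tableSchema = JDBCRDD.resolveTable(options)

// New-style call site: one connection serves both lookups, avoiding a second
// connect/handshake round trip to the remote engine.
JdbcUtils.withConnection(options) { conn =>
  val sharedTableSchema = JDBCRDD.resolveTable(options, conn)
  val sharedQuerySchema = JDBCRDD.getQueryOutputSchema(query, options, dialect, conn)
}
```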

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala

Lines changed: 35 additions & 31 deletions
@@ -16,7 +16,7 @@
  */
 package org.apache.spark.sql.execution.datasources.v2.jdbc

-import java.sql.SQLException
+import java.sql.{Connection, SQLException}
 import java.util

 import scala.collection.mutable
@@ -93,19 +93,21 @@ class JDBCTableCatalog extends TableCatalog
   }

   override def tableExists(ident: Identifier): Boolean = {
+    JdbcUtils.withConnection(options)(tableExists(ident, _))
+  }
+
+  private def tableExists(ident: Identifier, conn: Connection): Boolean = {
     checkNamespace(ident.namespace())
     val writeOptions = new JdbcOptionsInWrite(
       options.parameters + (JDBCOptions.JDBC_TABLE_NAME -> getTableName(ident)))
-    JdbcUtils.withConnection(options) {
-      JdbcUtils.classifyException(
-        condition = "FAILED_JDBC.TABLE_EXISTS",
-        messageParameters = Map(
-          "url" -> options.getRedactUrl(),
-          "tableName" -> toSQLId(ident)),
-        dialect,
-        description = s"Failed table existence check: $ident",
-        isRuntime = false)(JdbcUtils.tableExists(_, writeOptions))
-    }
+    JdbcUtils.classifyException(
+      condition = "FAILED_JDBC.TABLE_EXISTS",
+      messageParameters = Map(
+        "url" -> options.getRedactUrl(),
+        "tableName" -> toSQLId(ident)),
+      dialect,
+      description = s"Failed table existence check: $ident",
+      isRuntime = false)(JdbcUtils.tableExists(conn, writeOptions))
   }

   override def dropTable(ident: Identifier): Boolean = {
@@ -138,28 +140,30 @@ class JDBCTableCatalog extends TableCatalog
   }

   override def loadTable(ident: Identifier): Table = {
-    if (!tableExists(ident)) {
-      throw QueryCompilationErrors.noSuchTableError(ident)
-    }
+    JdbcUtils.withConnection(options) { conn =>
+      if (!tableExists(ident, conn)) {
+        throw QueryCompilationErrors.noSuchTableError(ident)
+      }

-    val optionsWithTableName = new JDBCOptions(
-      options.parameters + (JDBCOptions.JDBC_TABLE_NAME -> getTableName(ident)))
-    JdbcUtils.classifyException(
-      condition = "FAILED_JDBC.LOAD_TABLE",
-      messageParameters = Map(
-        "url" -> options.getRedactUrl(),
-        "tableName" -> toSQLId(ident)),
-      dialect,
-      description = s"Failed to load table: $ident",
-      isRuntime = false
-    ) {
-      val remoteSchemaFetchMetric = JdbcUtils
-        .createSchemaFetchMetric(SparkSession.active.sparkContext)
-      val schema = SQLMetrics.withTimingNs(remoteSchemaFetchMetric) {
-        JDBCRDD.resolveTable(optionsWithTableName)
+      val optionsWithTableName = new JDBCOptions(
+        options.parameters + (JDBCOptions.JDBC_TABLE_NAME -> getTableName(ident)))
+      JdbcUtils.classifyException(
+        condition = "FAILED_JDBC.LOAD_TABLE",
+        messageParameters = Map(
+          "url" -> options.getRedactUrl(),
+          "tableName" -> toSQLId(ident)),
+        dialect,
+        description = s"Failed to load table: $ident",
+        isRuntime = false
+      ) {
+        val remoteSchemaFetchMetric = JdbcUtils
+          .createSchemaFetchMetric(SparkSession.active.sparkContext)
+        val schema = SQLMetrics.withTimingNs(remoteSchemaFetchMetric) {
+          JDBCRDD.resolveTable(optionsWithTableName, conn)
+        }
+        JDBCTable(ident, schema, optionsWithTableName,
+          Map(JDBCRelation.schemaFetchKey -> remoteSchemaFetchMetric))
       }
-      JDBCTable(ident, schema, optionsWithTableName,
-        Map(JDBCRelation.schemaFetchKey -> remoteSchemaFetchMetric))
     }
   }
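Net effect, reading the two files together: loadTable previously opened one connection inside tableExists (via JdbcUtils.withConnection) and a second inside JDBCRDD.resolveTable (via the dialect's connection factory); it now opens a single connection up front and threads it through both the existence check and the schema fetch.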
