stackabletech · razvan · Aug 20, 2025 · Jul 22, 2025 · Jul 22, 2025 · Jul 23, 2025
diff --git a/spark-connect-client/versions.py b/spark-connect-client/versions.py
@@ -5,4 +5,10 @@
         "java-base": "17",
         "python": "3.11",
     },
+    {
+        "product": "4.0.0",
+        "spark-k8s": "4.0.0",
+        "java-base": "17",
+        "python": "3.11",
+    },
 ]
diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile
@@ -61,6 +61,17 @@ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patche
 COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR} /stackable/src/spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR}
 
 RUN <<EOF
+
+# IMPORTANT: HBase connectors don't support Spark 4 yet, so we skip the build.
+# Watch this PR for updates: https://github.com/apache/hbase-connectors/pull/130
+if [[ "${PRODUCT}" == 4* ]]; then
+    # Create this empty directory so that following COPY layers succeed.
+    mkdir -p /stackable/spark/jars
+    # Create an empty placeholder tarball because the rest of the build (written for Spark 3) expects this file to exist.
+    touch hbase-connector-${HBASE_CONNECTOR}-stackable${RELEASE}-src.tar.gz
+    exit 0
+fi
+
 cd "$(/stackable/patchable --images-repo-root=src checkout spark-k8s/hbase-connectors ${HBASE_CONNECTOR})/spark"
 
 NEW_VERSION="${HBASE_CONNECTOR}-stackable${RELEASE}"
@@ -110,6 +121,7 @@ mvn \
     --define hadoop-three.version="${HADOOP_VERSION}" \
     --define hbase.version="${HBASE}" \
     --define skipTests \
+    --define maven.test.skip=true \
     clean package
 
 mkdir -p /stackable/spark/jars
@@ -162,9 +174,6 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
 COPY --from=hadoop-builder --chown=${STACKABLE_USER_UID}:0 /stackable/patched-libs /stackable/patched-libs
 
 # >>> Build spark
-# Compiling the tests takes a lot of time, so we skip them
-# -Dmaven.test.skip=true skips both the compilation and execution of tests
-# -DskipTests skips only the execution
 RUN <<EOF
     # Make Maven aware of custom Stackable libraries
     mv /stackable/patched-libs/maven /root/.m2/repository
@@ -179,15 +188,35 @@ RUN <<EOF
     ORIGINAL_VERSION="${PRODUCT}"
     NEW_VERSION="${PRODUCT}-stackable${RELEASE}"
 
+    STACKABLE_HADOOP_VERSION="${HADOOP_HADOOP}-stackable${RELEASE}"
+
+    MAVEN_BIN="/tmp/apache-maven-${MAVEN_VERSION}/bin/mvn"
     export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"
 
-    ./dev/make-distribution.sh \
-    --mvn /tmp/apache-maven-${MAVEN_VERSION}/bin/mvn \
-    -Dhadoop.version="${HADOOP_VERSION}-stackable${RELEASE}" \
-    -DskipTests \
-    -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
-    --no-transfer-progress \
-    --batch-mode
+    case "${PRODUCT}" in
+        4*)
+            # The Spark 4 make-distribution.sh script accepts a --connect option that the Spark 3 script does not.
+            # This option is required to build Spark Connect.
+            # Passing it to the Spark 3 script breaks that build, so it is only provided in this branch.
+            ./dev/make-distribution.sh \
+            --mvn "${MAVEN_BIN}" \
+            --connect \
+            -Dhadoop.version="${STACKABLE_HADOOP_VERSION}" \
+            -DskipTests \
+            -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
+            --no-transfer-progress \
+            --batch-mode
+            ;;
+        *)
+            ./dev/make-distribution.sh \
+            --mvn "${MAVEN_BIN}" \
+            -Dhadoop.version="${STACKABLE_HADOOP_VERSION}" \
+            -DskipTests \
+            -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
+            --no-transfer-progress \
+            --batch-mode
+            ;;
+    esac
 
     sed -i "s/${NEW_VERSION}/${ORIGINAL_VERSION}/g" assembly/target/bom.json
 EOF
@@ -198,22 +227,30 @@ EOF
 # we create a new dist/connect folder, and copy them here.
 RUN <<EOF
 
-    # Get the Scala binary version
-    SCALA_BINARY_VERSION=$( \
-        mvn --quiet --non-recursive --no-transfer-progress --batch-mode --file pom.xml \
-        org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
-        -DforceStdout \
-        -Dexpression='project.properties(scala.binary.version)')
+    SCALA_BINARY_VERSION=$(grep "scala.binary.version" pom.xml | head -n1 | awk -F '[<>]' '{print $3}')
 
     mkdir -p dist/connect
     cd dist/connect
 
-    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
-    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
-    cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
-
-    # The Spark operator expects a file named spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}.jar without the -stackable${RELEASE} suffix.
+    case "${PRODUCT}" in
+        4*)
+            cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+            cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+            cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+            ;;
+        *)
+            cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+            cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+            cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
+            ;;
+    esac
+
+    # This link is needed by the operator and is kept for backwards compatibility.
+    # TODO: remove it at some time in the future.
     ln -s "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}.jar"
+    # Create a second link to the spark-connect jar whose name omits both the Scala version and the stackable suffix.
+    # This link supersedes the previous link.
+    ln -s "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" "spark-connect-${PRODUCT}.jar"
 EOF
 
 # <<< Build spark

diff --git a/spark-k8s/stackable/patches/4.0.0/0001-Update-CycloneDX-plugin.patch b/spark-k8s/stackable/patches/4.0.0/0001-Update-CycloneDX-plugin.patch
@@ -0,0 +1,38 @@
+From 2da5608928018dd017c91b904eb8f84a4f6df78a Mon Sep 17 00:00:00 2001
+From: Razvan-Daniel Mihai <[email protected]>
+Date: Fri, 4 Jul 2025 15:54:55 +0200
+Subject: Update CycloneDX plugin
+
+---
+ dev/make-distribution.sh | 1 -
+ pom.xml                  | 5 +++++
+ 2 files changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh
+index 16607e45ae..44e345a245 100755
+--- a/dev/make-distribution.sh
++++ b/dev/make-distribution.sh
+@@ -176,7 +176,6 @@ BUILD_COMMAND=("$MVN" clean package \
+     -Dmaven.javadoc.skip=true \
+     -Dmaven.scaladoc.skip=true \
+     -Dmaven.source.skip \
+-    -Dcyclonedx.skip=true \
+     $@)
+
+ # Actually build the jar
+diff --git a/pom.xml b/pom.xml
+index 443d46a430..632920f100 100644
+--- a/pom.xml
++++ b/pom.xml
+@@ -3327,6 +3327,11 @@
+         <groupId>org.cyclonedx</groupId>
+         <artifactId>cyclonedx-maven-plugin</artifactId>
+         <version>2.8.0</version>
++        <configuration>
++          <projectType>application</projectType>
++          <schemaVersion>1.5</schemaVersion>
++          <skipNotDeployed>false</skipNotDeployed>
++        </configuration>
+         <executions>
+           <execution>
+             <phase>package</phase>
diff --git a/spark-k8s/stackable/patches/4.0.0/patchable.toml b/spark-k8s/stackable/patches/4.0.0/patchable.toml
@@ -0,0 +1,2 @@
+base = "fa33ea000a0bda9e5a3fa1af98e8e85b8cc5e4d4"
+mirror = "https://github.com/stackabletech/spark.git"
diff --git a/spark-k8s/versions.py b/spark-k8s/versions.py
@@ -35,4 +35,22 @@
         "tini": "0.19.0",
         "hbase_connector": "1.0.1",
     },
+    {
+        "product": "4.0.0",
+        "java-base": "17",
+        "java-devel": "17",
+        "python": "3.11",
+        "hadoop/hadoop": "3.4.1",
+        "hbase": "2.6.2",
+        "aws_java_sdk_bundle": "2.24.6",
+        "azure_storage": "7.0.1",  # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4 -- NOTE(review): link is for hadoop-azure 3.3.4 but this entry uses Hadoop 3.4.1; confirm the version still applies
+        "azure_keyvault_core": "1.0.0",  # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
+        "jackson_dataformat_xml": "2.15.2",  # https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.13/3.5.1 -- NOTE(review): link is for Spark 3.5.1 but this entry is Spark 4.0.0; confirm the version still applies
+        "stax2_api": "4.2.1",  # https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-xml/2.15.2
+        "woodstox_core": "6.5.1",  # https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-xml/2.15.2
+        "vector": "0.47.0",
+        "jmx_exporter": "1.3.0",
+        "tini": "0.19.0",
+        "hbase_connector": "1.0.1",
+    },
 ]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		base = "fa33ea000a0bda9e5a3fa1af98e8e85b8cc5e4d4"
		mirror = "https://github.com/stackabletech/spark.git"