@@ -664,7 +664,41 @@ fn rolegroup_config_map(
         )
         .add("dfs.datanode.registered.hostname", "${env.POD_ADDRESS}")
         .add("dfs.datanode.registered.port", "${env.DATA_PORT}")
-        .add("dfs.datanode.registered.ipc.port", "${env.IPC_PORT}");
+        .add("dfs.datanode.registered.ipc.port", "${env.IPC_PORT}")
+        // The following two properties are set to "true" because there is a minor chance that data
+        // written to HDFS is not synced to disk even if a block has been closed.
+        // HBase users can control this explicitly for the WAL, but for flushes and compactions
+        // I believe they can't as easily (if at all).
+        // In theory, HBase should be able to recover from such failures, but that comes at a cost
+        // and there's always a risk.
+        // Enabling these settings causes HDFS to sync data to disk as soon as possible.
+        .add("dfs.datanode.sync.behind.writes", "true")
+        .add("dfs.datanode.synconclose", "true")
+        // Has defaulted to 10 since at least 2011.
+        // This controls the number of concurrent client connections (DataNodes included)
+        // to the NameNode. Ideally we'd scale this with the number of DataNodes, but that
+        // would lead to restarts of the NameNode.
+        // Raising it to 50 should improve performance due to the added concurrency.
+        .add("dfs.namenode.handler.count", "50")
+        // Has defaulted to 10 since at least 2012.
+        // This controls the number of concurrent client connections to the DataNodes.
+        // We have no idea how many clients there may be, so it's hard to pick a good default.
+        // Raising it to 50 should improve performance due to the added concurrency, especially
+        // with use cases like HBase.
+        .add("dfs.datanode.handler.count", "50")
+        // The following two properties have defaulted to 2 and 4 respectively since around 2013.
+        // They control the maximum number of replication "jobs" the NameNode assigns to
+        // a DataNode in a single heartbeat.
+        // Increasing them raises network usage during replication events
+        // but can lead to faster recovery.
+        .add("dfs.namenode.replication.max-streams", "4")
+        .add("dfs.namenode.replication.max-streams-hard-limit", "8")
+        // Defaults to 4096 and hasn't changed since at least 2011.
+        // This is the number of threads used for actual data transfer; the work is not very
+        // CPU heavy but IO bound, which is why the default is already relatively high.
+        // Today's Java and IO stacks should be able to handle more, so bump it to 8192 for
+        // better performance/concurrency.
+        .add("dfs.datanode.max.transfer.threads", "8192");
     if hdfs.has_https_enabled() {
         hdfs_site.add("dfs.datanode.registered.https.port", "${env.HTTPS_PORT}");
     } else {
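
For context, each `.add(key, value)` call in the hunk above queues one property for the rendered `hdfs-site.xml`. Below is a minimal, self-contained sketch of that kind of chainable key/value builder; the `HadoopConfigBuilder` name, its methods, and the XML rendering are invented for illustration and are not taken from the operator's code.

```rust
// Hypothetical stand-in for the config builder used in the diff above.
// It only illustrates the chainable `.add(key, value)` pattern and a
// Hadoop-style XML rendering; none of this is the operator's real code.
use std::collections::BTreeMap;

#[derive(Default)]
struct HadoopConfigBuilder {
    entries: BTreeMap<String, String>,
}

impl HadoopConfigBuilder {
    // Chainable add, mirroring the `.add("key", "value")` calls in the diff.
    fn add(&mut self, key: &str, value: &str) -> &mut Self {
        self.entries.insert(key.to_owned(), value.to_owned());
        self
    }

    // Render the collected entries as a Hadoop-style configuration document.
    fn build_xml(&self) -> String {
        let mut xml = String::from("<configuration>\n");
        for (key, value) in &self.entries {
            xml.push_str(&format!(
                "  <property><name>{key}</name><value>{value}</value></property>\n"
            ));
        }
        xml.push_str("</configuration>\n");
        xml
    }
}

fn main() {
    let mut hdfs_site = HadoopConfigBuilder::default();
    hdfs_site
        .add("dfs.datanode.sync.behind.writes", "true")
        .add("dfs.datanode.synconclose", "true")
        .add("dfs.namenode.handler.count", "50")
        .add("dfs.datanode.handler.count", "50")
        .add("dfs.namenode.replication.max-streams", "4")
        .add("dfs.namenode.replication.max-streams-hard-limit", "8")
        .add("dfs.datanode.max.transfer.threads", "8192");
    println!("{}", hdfs_site.build_xml());
}
```
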
@@ -683,7 +717,10 @@ fn rolegroup_config_map(
         .ha_zookeeper_quorum()
         .security_config(hdfs, cluster_info)
         .context(BuildSecurityConfigSnafu)?
-        .enable_prometheus_endpoint();
+        .enable_prometheus_endpoint()
+        // The default (4096) hasn't changed since 2009.
+        // Increase to 128 KiB (131072 bytes) to allow for faster transfers.
+        .add("io.file.buffer.size", "131072");
     if let Some(hdfs_opa_config) = hdfs_opa_config {
         hdfs_opa_config.add_core_site_config(&mut core_site);
     }
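
As a quick sanity check on the new `io.file.buffer.size` value, 131072 bytes is exactly 128 KiB, a 32x increase over the old 4096-byte default mentioned in the comment above. A tiny standalone sketch, just to make the arithmetic explicit:

```rust
fn main() {
    // io.file.buffer.size is specified in bytes; 131072 bytes is 128 KiB.
    let buffer_size_bytes: u64 = 128 * 1024;
    assert_eq!(buffer_size_bytes, 131_072);
    // The old default of 4096 bytes (4 KiB) means this is a 32x increase.
    assert_eq!(buffer_size_bytes / 4096, 32);
    println!("io.file.buffer.size = {buffer_size_bytes} bytes (128 KiB)");
}
```
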