Skip to content

Commit 12f669f

Browse files
committed
improve heatmap performance by distribution of computations, downsampling and feature selection #4
1 parent 2f6f049 commit 12f669f

File tree

13 files changed

+216
-96
lines changed

13 files changed

+216
-96
lines changed

config/config.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,14 @@ umap:
3535

3636
##### HEATMAP #####
3737
# information on the ComplexHeatmap parameters: https://jokergoo.github.io/ComplexHeatmap-reference/book/index.html
38-
# distance metrics: for rows and columns. all metrics that are supported by stats::dist() (https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/dist) and 'pearson','spearman', and 'kendall'.
38+
# distance metrics: for rows and columns. all metrics that are supported by scipy.spatial.distance.pdist (https://docs.scipy.org/doc/scipy-1.14.0/reference/generated/scipy.spatial.distance.pdist.html)
3939
# clustering methods: methods for hierarchical clustering that are supported by stats::hclust() (https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/hclust)
4040
# it is the most resource (memory) intensive method, leave empty [] if not required
4141
heatmap:
42-
metrics: ['spearman']
42+
metrics: ['correlation','cosine']
4343
hclust_methods: ['complete']
44+
n_observations: 1000 # randomly sampled proportion as float [0-1] or absolute number as integer
45+
n_features: 0.5 # percentage of highly variable features as float [0-1] or absolute number as integer
4446

4547
##### LEIDEN #####
4648
# Leiden clustering applied on UMAP KNN graphs specified by the respective parameters (metric, n_neighbors).

workflow/Snakefile

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,12 @@ rule all:
142142
n_components=[dims for dims in config["umap"]["n_components"] if dims in [2,3]]
143143
) if 2 in config["umap"]["n_components"] or 3 in config["umap"]["n_components"] else [],
144144
# Heatmap
145-
heatmap_plots = expand(os.path.join(result_path,'{sample}','Heatmap','plots','Heatmap_{method}_{metric}.png'),
145+
# distance_matrices = expand(os.path.join(result_path,'{sample}','Heatmap','DistanceMatrix_{metric}_{type}.csv'),
146+
# sample=list(annot.index),
147+
# metric=config["heatmap"]["metrics"],
148+
# type=["observations","features"],
149+
# ),
150+
heatmap_plots = expand(os.path.join(result_path,'{sample}','Heatmap','plots','Heatmap_{metric}_{method}.png'),
146151
sample=list(annot.index),
147152
method=config["heatmap"]["hclust_methods"],
148153
metric=config["heatmap"]["metrics"],
@@ -161,7 +166,7 @@ rule all:
161166
sample=list(annot.index),
162167
index_type = ["external", "internal"] if config["sample_proportion"]>0 else ["external"],
163168
) if len(cluster_methods)>0 else [],
164-
envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=['clusterCrit','clustree','ComplexHeatmap','ggplot','umap_leiden','plotly','pymcdm','sklearn']),
169+
envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=['clusterCrit','clustree','ComplexHeatmap','ggplot','umap_leiden','plotly','pymcdm']),
165170
configs = os.path.join(config["result_path"],'configs',module_name,'{}_config.yaml'.format(config["project_name"])),
166171
annotations = os.path.join(config["result_path"],'configs',module_name,'{}_annot.csv'.format(config["project_name"])),
167172
resources:

workflow/envs/fastdist_UNUSED.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
- defaults
5+
dependencies:
6+
- scikit-learn
7+
- pandas=1.5.0
8+
- numpy
9+
- numba
10+
- pip
11+
- pip:
12+
- fastdist==1.1.6

workflow/rules/cluster_validation.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ rule validation_external:
8282
mem_mb=config.get("mem", "16000"),
8383
threads: config.get("threads", 1)
8484
conda:
85-
"../envs/sklearn.yaml"
85+
"../envs/umap_leiden.yaml"
8686
log:
8787
os.path.join("logs","rules","validation_external_{sample}.log"),
8888
params:

workflow/rules/clustering.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ rule clustification:
3232
mem_mb=config.get("mem", "16000"),
3333
threads: 8#config.get("threads", 1)
3434
conda:
35-
"../envs/sklearn.yaml"
35+
"../envs/umap_leiden.yaml"
3636
log:
3737
os.path.join("logs","rules","clustification_{sample}_clusterings.log"),
3838
params:

workflow/rules/common.smk

Lines changed: 40 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -14,66 +14,59 @@ def get_data_orientation(wildcards):
1414

1515
def get_umap_sample_paths(wildcards):
1616
return [annot.loc[wildcards.sample,'data'],
17-
os.path.join(config["result_path"],'unsupervised_analysis','{}'.format(wildcards.sample),'UMAP','UMAP_{}_'.format(wildcards.metric)+'{}'.format(max(config["umap"]["n_neighbors"]))+'_graph.pickle')]
17+
os.path.join(result_path,'{}'.format(wildcards.sample),'UMAP','UMAP_{}_'.format(wildcards.metric)+'{}'.format(max(config["umap"]["n_neighbors"]))+'_graph.pickle')]
1818

1919
def get_dimred_paths(wildcards):
2020
path_dict = {}
2121

2222
if wildcards.method=="PCA":
23-
path_dict['dimred_data'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards))
24-
path_dict['dimred_axes'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards))
25-
path_dict['dimred_var'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards))
26-
path_dict['dimred_loadings'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards))
27-
# return {
28-
# 'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
29-
# 'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
30-
# 'dimred_var': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards)),
31-
# 'dimred_loadings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards)),
32-
# 'metadata': annot.loc[wildcards.sample,"metadata"],
33-
# 'metadata_features': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
34-
# }
23+
path_dict['dimred_data'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards))
24+
path_dict['dimred_axes'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards))
25+
path_dict['dimred_var'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards))
26+
path_dict['dimred_loadings'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards))
3527
else:
36-
path_dict['dimred_data'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards))
37-
path_dict['dimred_axes'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards))
38-
# return {
39-
# 'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
40-
# 'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
41-
# 'metadata': annot.loc[wildcards.sample,"metadata"],
42-
# 'metadata_features': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
43-
# }
28+
path_dict['dimred_data'] = os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards))
29+
path_dict['dimred_axes'] = os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards))
4430

4531
# add metadata
4632
path_dict['metadata'] = annot.loc[wildcards.sample,"metadata"]
4733
# add features
48-
path_dict['metadata_features'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
34+
path_dict['metadata_features'] = os.path.join(result_path,wildcards.sample,'metadata_features.csv')
4935
# add clustering results
5036
if len(cluster_methods) > 0:
51-
path_dict['metadata_clusterings'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
37+
path_dict['metadata_clusterings'] = os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')
5238

5339
return path_dict
5440

5541
def get_dimred_features_paths(wildcards):
5642

5743
if wildcards.method=="PCA":
5844
return {
59-
'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
60-
'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
61-
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
45+
'dimred_data': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
46+
'dimred_axes': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
47+
'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
6248
}
6349
else:
6450
return {
65-
'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
66-
'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
67-
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
51+
'dimred_data': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
52+
'dimred_axes': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
53+
'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
6854
}
6955

56+
########## HEATMAPS ##########
57+
def get_heatmap_paths(wildcards):
58+
return {'data': annot.loc[wildcards.sample,'data'],
59+
'metadata': annot.loc[wildcards.sample,"metadata"],
60+
'observations_distance': os.path.join(result_path,wildcards.sample,'Heatmap','DistanceMatrix_{wildcards.metric}_observations.csv'.format(wildcards=wildcards)),
61+
'features_distance': os.path.join(result_path,wildcards.sample,'Heatmap','DistanceMatrix_{wildcards.metric}_features.csv'.format(wildcards=wildcards)),
62+
}
7063

7164
########## CLUSTERING ##########
7265

7366
# get paths for clustification
7467
def get_clustification_paths(wildcards):
7568
return [annot.loc[wildcards.sample,'data'],
76-
os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'{}'.format(config["clustification"]["method"]),'{}_clusterings.csv'.format(config["clustification"]["method"]))
69+
os.path.join(result_path,wildcards.sample,'{}'.format(config["clustification"]["method"]),'{}_clusterings.csv'.format(config["clustification"]["method"]))
7770
]
7871

7972
# get all clustering results of one method to be aggregated into {method}/{method}_clusterings.csv
@@ -90,33 +83,31 @@ def get_clustering_paths(wildcards):
9083
else:
9184
leiden_parameters.append("{}_NA".format(partition_type))
9285

93-
path_list = path_list + expand(os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'Leiden','Leiden_{metric}_{n_neighbors}_{leiden_parameters}_clustering.csv'),
86+
path_list = path_list + expand(os.path.join(result_path,wildcards.sample,'Leiden','Leiden_{metric}_{n_neighbors}_{leiden_parameters}_clustering.csv'),
9487
metric=config["leiden"]["metrics"],
9588
n_neighbors=config["leiden"]["n_neighbors"],
9689
leiden_parameters=leiden_parameters,
97-
# partition_type=config["leiden"]["partition_types"],
98-
# resolution=config["leiden"]["resolutions"]
9990
)
10091
return path_list
10192

10293
# get all aggregated clustering results across methods to be aggregated into {sample}/metadata_clusterings.csv
10394
def get_aggregated_clustering_paths(wildcards):
104-
return expand(os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'{method}','{method}_clusterings.csv'), method=cluster_methods)
95+
return expand(os.path.join(result_path,wildcards.sample,'{method}','{method}_clusterings.csv'), method=cluster_methods)
10596

10697
# get the aggregated clustering results across methods for visualization
10798
def get_metadata_clustering_paths(wildcards):
10899

109100
if wildcards.method=="PCA":
110101
return {
111-
'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
112-
'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
113-
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
102+
'dimred_data': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
103+
'dimred_axes': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
104+
'metadata': os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')
114105
}
115106
else:
116107
return {
117-
'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
118-
'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
119-
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
108+
'dimred_data': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
109+
'dimred_axes': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
110+
'metadata': os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')
120111
}
121112

122113
########## CLUSTER VALIDATION ##########
@@ -126,36 +117,36 @@ def get_clustree_paths(wildcards):
126117

127118
if wildcards.content=="features":
128119
return {
129-
'metadata_clustering': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
130-
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
120+
'metadata_clustering': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
121+
'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
131122
}
132123
else:
133124
return {
134-
'metadata_clustering': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
125+
'metadata_clustering': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
135126
'metadata': annot.loc[wildcards.sample,"metadata"]
136127
}
137128

138129
# get paths to determine external cluster indices
139130
def get_external_validation_paths(wildcards):
140-
return {'clusterings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
131+
return {'clusterings': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
141132
'metadata': annot.loc[wildcards.sample,"metadata"]
142133
}
143134

144135
# get paths to determine internal cluster indices
145136
def get_internal_validation_paths(wildcards):
146137
return {#'data': annot.loc[wildcards.sample,'data'],
147138
'metadata': annot.loc[wildcards.sample,"metadata"],
148-
'clusterings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
149-
'pca': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_default_data.csv'),
150-
'pca_var': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_default_var.csv')
139+
'clusterings': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
140+
'pca': os.path.join(result_path,wildcards.sample,'PCA','PCA_{}_{}_data.csv'.format(config["pca"]["svd_solver"],config["pca"]["n_components"])),
141+
'pca_var': os.path.join(result_path,wildcards.sample,'PCA','PCA_{}_{}_var.csv'.format(config["pca"]["svd_solver"],config["pca"]["n_components"]))
151142
}
152143

153144
# for plotting heatmaps of cluster indices
154145
def get_validation_paths(wildcards):
155146
if wildcards.type=="external":
156147
return {
157-
idx: os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "cluster_validation", "external_index_{}.csv".format(idx)) for idx in indices_external
148+
idx: os.path.join(result_path,wildcards.sample, "cluster_validation", "external_index_{}.csv".format(idx)) for idx in indices_external
158149
}
159150
else:
160-
return {"ranked_internal_indices": os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "cluster_validation", "internal_indices_ranked.csv")}
151+
return {"ranked_internal_indices": os.path.join(result_path,wildcards.sample, "cluster_validation", "internal_indices_ranked.csv")}
161152

0 commit comments

Comments
 (0)