
Commit 16c64c9

feat: add spark support
1 parent c0d29b4 commit 16c64c9

Showing 17 changed files with 105 additions and 194 deletions.

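For context, the feature this commit builds toward is profiling a Spark DataFrame directly. A rough usage sketch, assuming a local SparkSession and that ProfileReport accepts a pyspark DataFrame the same way it accepts a pandas one:

from pyspark.sql import SparkSession

from ydata_profiling import ProfileReport

# assumes a local Spark session and a tiny example dataset
spark = SparkSession.builder.appName("profiling-example").getOrCreate()
df = spark.createDataFrame(
    [(1, "a", True), (2, "b", False), (3, "a", True)],
    ["id", "category", "flag"],
)

# ProfileReport dispatches to the spark backend this commit extends
report = ProfileReport(df, title="Spark profiling example")
report.to_file("spark_report.html")
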
src/ydata_profiling/model/pandas/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -3,10 +3,8 @@
     dataframe_pandas,
     describe_boolean_pandas,
     describe_categorical_pandas,
-    describe_counts_pandas,
     describe_date_pandas,
     describe_file_pandas,
-    describe_generic_pandas,
     describe_image_pandas,
     describe_numeric_pandas,
     describe_path_pandas,
@@ -27,10 +25,8 @@
     "dataframe_pandas",
     "describe_boolean_pandas",
     "describe_categorical_pandas",
-    "describe_counts_pandas",
     "describe_date_pandas",
     "describe_file_pandas",
-    "describe_generic_pandas",
     "describe_image_pandas",
     "describe_numeric_pandas",
     "describe_path_pandas",

src/ydata_profiling/model/pandas/describe_counts_pandas.py

Lines changed: 0 additions & 64 deletions
This file was deleted.

src/ydata_profiling/model/pandas/describe_generic_pandas.py

Lines changed: 0 additions & 37 deletions
This file was deleted.

src/ydata_profiling/model/spark/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -3,9 +3,7 @@
     dataframe_spark,
     describe_boolean_spark,
     describe_categorical_spark,
-    describe_counts_spark,
     describe_date_spark,
-    describe_generic_spark,
     describe_numeric_spark,
     describe_supported_spark,
     duplicates_spark,
@@ -21,9 +19,7 @@
     "dataframe_spark",
     "describe_boolean_spark",
     "describe_categorical_spark",
-    "describe_counts_spark",
     "describe_date_spark",
-    "describe_generic_spark",
     "describe_numeric_spark",
     "describe_supported_spark",
     "duplicates_spark",

src/ydata_profiling/model/spark/correlations_spark.py

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 """Correlations between variables."""
+
 from typing import Optional
 
 import pandas as pd

src/ydata_profiling/model/spark/describe_boolean_spark.py

Lines changed: 4 additions & 3 deletions
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_boolean_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_boolean_1d.register
 def describe_boolean_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a boolean series.
 
     Args:
@@ -20,7 +21,7 @@ def describe_boolean_1d_spark(
         A dict containing calculated series description values.
     """
 
-    value_counts = summary["value_counts"]
+    value_counts = summary.value_counts
 
     # get the most common boolean value and its frequency
     top = value_counts.first()

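The signature change above is the heart of this commit: the spark describe functions now receive a VarDescription instead of a plain dict. The class itself lives in ydata_profiling.model.var_description.default and is not part of this diff, but the call sites imply it exposes the count fields (n, value_counts, value_counts_without_nan) as attributes while still supporting dict-style item access and update() for computed statistics. A minimal stand-in illustrating that access pattern (the class below is hypothetical, not the real implementation):

from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class VarDescriptionStandIn:
    """Hypothetical sketch of the interface the spark handlers rely on."""

    n: int                         # total number of observations
    value_counts: Any              # spark DataFrame of value counts
    value_counts_without_nan: Any  # value counts with nulls dropped
    _stats: Dict[str, Any] = field(default_factory=dict)

    def __getitem__(self, key: str) -> Any:
        # dict-style reads, e.g. summary["n_distinct"]
        return self._stats[key]

    def __setitem__(self, key: str, value: Any) -> None:
        # dict-style writes, e.g. summary["p_negative"] = ...
        self._stats[key] = value

    def update(self, other: Dict[str, Any]) -> None:
        # bulk update, as used with histogram_compute(...)
        self._stats.update(other)
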
src/ydata_profiling/model/spark/describe_categorical_spark.py

Lines changed: 3 additions & 2 deletions
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_categorical_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_categorical_1d.register
 def describe_categorical_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a categorical series.
 
     Args:

src/ydata_profiling/model/spark/describe_date_spark.py

Lines changed: 3 additions & 2 deletions
@@ -6,6 +6,7 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_date_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def date_stats_spark(df: DataFrame, summary: dict) -> dict:
@@ -21,8 +22,8 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict:
 
 @describe_date_1d.register
 def describe_date_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a date series.
 
     Args:

src/ydata_profiling/model/spark/describe_generic_spark.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

src/ydata_profiling/model/spark/describe_numeric_spark.py

Lines changed: 11 additions & 10 deletions
@@ -9,9 +9,10 @@
     describe_numeric_1d,
     histogram_compute,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
-def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
+def numeric_stats_spark(df: DataFrame, summary: VarDescription) -> dict:
     column = df.columns[0]
 
     expr = [
@@ -29,8 +30,8 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
 
 @describe_numeric_1d.register
 def describe_numeric_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a boolean series.
 
     Args:
@@ -51,7 +52,7 @@ def describe_numeric_1d_spark(
     summary["kurtosis"] = stats["kurtosis"]
     summary["sum"] = stats["sum"]
 
-    value_counts = summary["value_counts"]
+    value_counts = summary.value_counts
 
     n_infinite = (
         value_counts.where(F.col(df.columns[0]).isin([np.inf, -np.inf]))
@@ -106,12 +107,12 @@ def describe_numeric_1d_spark(
     ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0]
 
     # FIXME: move to fmt
-    summary["p_negative"] = summary["n_negative"] / summary["n"]
+    summary["p_negative"] = summary["n_negative"] / summary.n
     summary["range"] = summary["max"] - summary["min"]
     summary["iqr"] = summary["75%"] - summary["25%"]
     summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.NaN
-    summary["p_zeros"] = summary["n_zeros"] / summary["n"]
-    summary["p_infinite"] = summary["n_infinite"] / summary["n"]
+    summary["p_zeros"] = summary["n_zeros"] / summary.n
+    summary["p_infinite"] = summary["n_infinite"] / summary.n
 
     # TODO - enable this feature
     # because spark doesn't have an indexing system, there isn't really the idea of monotonic increase/decrease
@@ -124,14 +125,14 @@ def describe_numeric_1d_spark(
     # display in pandas display
     # the alternative is to do this in spark natively, but it is not trivial
     infinity_values = [np.inf, -np.inf]
-    infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values)
+    infinity_index = summary.value_counts_without_nan.index.isin(infinity_values)
 
     summary.update(
         histogram_compute(
             config,
-            summary["value_counts_without_nan"][~infinity_index].index.values,
+            summary.value_counts_without_nan[~infinity_index].index.values,
             summary["n_distinct"],
-            weights=summary["value_counts_without_nan"][~infinity_index].values,
+            weights=summary.value_counts_without_nan[~infinity_index].values,
         )
     )

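All of the spark handlers touched above follow the same registration pattern: a function decorated with the corresponding describe_*_1d.register hook takes (config, df, summary) and returns the same triple after filling in statistics. A rough sketch of what an additional handler would look like under that pattern (describe_text_1d and the max_length statistic are hypothetical here, purely to illustrate the shape):

from typing import Tuple

import pyspark.sql.functions as F
from pyspark.sql import DataFrame

from ydata_profiling.config import Settings
from ydata_profiling.model.summary_algorithms import describe_text_1d  # hypothetical hook
from ydata_profiling.model.var_description.default import VarDescription


@describe_text_1d.register
def describe_text_1d_spark(
    config: Settings, df: DataFrame, summary: VarDescription
) -> Tuple[Settings, DataFrame, VarDescription]:
    """Sketch of a spark text handler mirroring the pattern in this commit."""
    column = df.columns[0]

    # computed statistics go through item access, as in the numeric handler
    summary["max_length"] = df.select(F.max(F.length(column))).first()[0]

    return config, df, summary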