From f46f92f6a1c818d881278008921cf47ee87e064e Mon Sep 17 00:00:00 2001 From: landon Date: Sun, 28 Sep 2025 16:32:37 +0800 Subject: [PATCH 1/2] Add internationalization (i18n) implementation --- docs/features/internationalization.md | 40 ++++++ examples/i18n/i18n_example.py | 42 +++++++ src/ydata_profiling/config.py | 40 +++++- src/ydata_profiling/config_default.yaml | 5 + src/ydata_profiling/config_minimal.yaml | 5 + src/ydata_profiling/i18n/__init__.py | 116 ++++++++++++++++++ src/ydata_profiling/i18n/locales/en.json | 104 ++++++++++++++++ src/ydata_profiling/i18n/locales/zh.json | 104 ++++++++++++++++ src/ydata_profiling/profile_report.py | 30 ++++- .../presentation/flavours/html/__init__.py | 2 + .../flavours/html/i18n_extension.py | 33 +++++ .../presentation/flavours/html/templates.py | 15 ++- .../templates/sequence/overview_tabs.html | 2 +- .../report/structure/report.py | 20 +-- src/ydata_profiling/utils/locale_utils.py | 45 +++++++ tests/unit/test_i18n.py | 41 +++++++ 16 files changed, 628 insertions(+), 16 deletions(-) create mode 100644 docs/features/internationalization.md create mode 100644 examples/i18n/i18n_example.py create mode 100644 src/ydata_profiling/i18n/__init__.py create mode 100644 src/ydata_profiling/i18n/locales/en.json create mode 100644 src/ydata_profiling/i18n/locales/zh.json create mode 100644 src/ydata_profiling/report/presentation/flavours/html/i18n_extension.py create mode 100644 src/ydata_profiling/utils/locale_utils.py create mode 100644 tests/unit/test_i18n.py diff --git a/docs/features/internationalization.md b/docs/features/internationalization.md new file mode 100644 index 000000000..3a0e1d64c --- /dev/null +++ b/docs/features/internationalization.md @@ -0,0 +1,40 @@ +# 国际化支持 + +ydata-profiling 现在支持多语言报告生成。 + +## 支持的语言 + +- 英文 (en) - 默认 +- 中文 (zh) + +## 使用方法 + +### 方法1:全局设置语言 + +```python +from ydata_profiling import ProfileReport +from ydata_profiling.i18n import set_locale + +# 设置中文 +set_locale('zh') +profile = 
ProfileReport(df) +profile.to_file('中文报告.html') +``` + +### 方法2:在初始化时指定语言 +```python +# 直接指定语言 +profile = ProfileReport(df, locale='zh', title='我的数据报告') +profile.to_file('报告.html') +``` +### 方法3:通过配置文件 +```python +# config.yaml +i18n: + locale: "zh" + auto_detect: false + fallback_locale: "en" +``` +```python +profile = ProfileReport(df, config_file='config.yaml') +``` diff --git a/examples/i18n/i18n_example.py b/examples/i18n/i18n_example.py new file mode 100644 index 000000000..faca0a566 --- /dev/null +++ b/examples/i18n/i18n_example.py @@ -0,0 +1,42 @@ +""" +Example of using ydata-profiling with internationalization +""" +import pandas as pd +from ydata_profiling import ProfileReport +from ydata_profiling.i18n import set_locale +from ydata_profiling.utils.locale_utils import auto_set_locale + +# Create sample data +df = pd.DataFrame({ + 'numeric_column': [1, 2, 3, 4, 5], + 'categorical_column': ['A', 'B', 'A', 'C', 'B'], + 'text_column': ['Hello', 'World', 'Test', 'Data', 'Science'] +}) + +# Use the default report generation method +print("Default report generation report...") +profile_default = ProfileReport(df, title="Default Data Profiling Report") +profile_default.to_file("default_report.html") + +# Auto-detect and set language +print("Auto-detect generation report...") +auto_set_locale() +profile_zh = ProfileReport(df, title="Auto Detect Data Profiling Report") +profile_zh.to_file("auto_report_chinese.html") + +# Generate a report in English +print("Generating English report...") +set_locale('en') +profile_en = ProfileReport(df, title="English Data Profiling Report") +profile_en.to_file("report_english.html") + +# Generate a report in Chinese +print("Generating Chinese report...") +set_locale('zh') +profile_zh = ProfileReport(df, title="中文数据分析报告") +profile_zh.to_file("report_chinese.html") + +# Specify the language during initialization +print("Generating report with locale parameter...") +profile_locale = ProfileReport(df, title="报告标题", locale='zh') 
+profile_locale.to_file("report_with_locale.html") \ No newline at end of file diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 09dbecdde..d02b677df 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -5,6 +5,7 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr +from ydata_profiling.i18n import set_locale, get_locale def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: @@ -290,6 +291,14 @@ class Report(BaseModel): precision: int = 8 +# Add a new configuration item in the Settings class +class I18n(BaseModel): + """Internationalization settings""" + locale: str = "en" + auto_detect: bool = True + fallback_locale: str = "en" + + class Settings(BaseSettings): # Default prefix to avoid collisions with environment variables class Config: @@ -355,6 +364,24 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() + # Add internationalization configuration + i18n: I18n = I18n() + + def __init__(self, **data): + # Check the current language setting before initialization + current_locale = get_locale() + + super().__init__(**data) + + # If no locale is explicitly specified and there is currently a non-default language setting, + # maintain the current setting + if 'i18n' not in data and current_locale != 'en': + self.i18n.locale = current_locale + + # Set locale + if self.i18n.locale: + set_locale(self.i18n.locale) + def update(self, updates: dict) -> "Settings": update = _merge_dictionaries(self.dict(), updates) return self.parse_obj(self.copy(update=update)) @@ -368,10 +395,21 @@ def from_file(config_file: Union[Path, str]) -> "Settings": Returns: Settings """ + # Save current language settings + current_locale = get_locale() + with open(config_file) as f: data = yaml.safe_load(f) - return Settings.parse_obj(data) + settings = Settings.parse_obj(data) + + # If no language is specified in the configuration file and there is currently a non-default language 
setting, + # maintain the current setting + if 'i18n' not in data and current_locale != 'en': + settings.i18n.locale = current_locale + set_locale(current_locale) + + return settings class SparkSettings(Settings): diff --git a/src/ydata_profiling/config_default.yaml b/src/ydata_profiling/config_default.yaml index a72b8bdc2..cddd5d1f2 100644 --- a/src/ydata_profiling/config_default.yaml +++ b/src/ydata_profiling/config_default.yaml @@ -221,3 +221,8 @@ html: - "#198754" full_width: false + +i18n: + locale: "en" + auto_detect: true + fallback_locale: "en" \ No newline at end of file diff --git a/src/ydata_profiling/config_minimal.yaml b/src/ydata_profiling/config_minimal.yaml index a3cb46211..dd364e7be 100644 --- a/src/ydata_profiling/config_minimal.yaml +++ b/src/ydata_profiling/config_minimal.yaml @@ -220,3 +220,8 @@ html: - "#198754" full_width: false + +i18n: + locale: "en" + auto_detect: true + fallback_locale: "en" \ No newline at end of file diff --git a/src/ydata_profiling/i18n/__init__.py b/src/ydata_profiling/i18n/__init__.py new file mode 100644 index 000000000..e67a12460 --- /dev/null +++ b/src/ydata_profiling/i18n/__init__.py @@ -0,0 +1,116 @@ +""" +Internationalization module for ydata-profiling +""" +import os +import json +from pathlib import Path +from typing import Dict, Optional +import threading + + +class TranslationManager: + """Manages translations for ydata-profiling""" + + _instance = None + _lock = threading.Lock() + + def __new__(cls): + if cls._instance is None: + with cls._lock: + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self): + if not hasattr(self, 'initialized'): + self.translations: Dict[str, Dict[str, str]] = {} + self.current_locale = 'en' + self.fallback_locale = 'en' + self.initialized = True + self._load_translations() + + def _load_translations(self): + """Load all translation files""" + translations_dir = Path(__file__).parent / 'locales' + if not 
translations_dir.exists():
+            return
+
+        for locale_file in translations_dir.glob('*.json'):
+            locale = locale_file.stem
+            try:
+                with open(locale_file, 'r', encoding='utf-8') as f:
+                    self.translations[locale] = json.load(f)
+            except Exception as e:
+                print(f"Warning: Failed to load translation file {locale_file}: {e}")
+
+    def set_locale(self, locale: str):
+        """Activate ``locale``; an unknown locale activates the fallback locale, as the warning states."""
+        if locale not in self.translations and locale != self.fallback_locale:
+            print(f"Warning: Locale '{locale}' not found, using fallback '{self.fallback_locale}'")
+            locale = self.fallback_locale
+        self.current_locale = locale
+
+    def get_translation(self, key: str, locale: Optional[str] = None, **kwargs) -> str:
+        """Translate dotted ``key`` for ``locale`` (default: current), then the fallback locale, else return ``key``."""
+        target_locale = locale or self.current_locale
+
+        # First choice: the requested (or currently active) locale
+        if target_locale in self.translations:
+            translation = self._get_nested_value(self.translations[target_locale], key)
+            if translation:
+                return self._format_translation(translation, **kwargs)
+
+        # Second choice: the fallback locale, unless it was already tried above
+        if target_locale != self.fallback_locale and self.fallback_locale in self.translations:
+            translation = self._get_nested_value(self.translations[self.fallback_locale], key)
+            if translation:
+                return self._format_translation(translation, **kwargs)
+
+        # Nothing found (or only an empty string): hand the key back unchanged
+        return key
+
+    def _get_nested_value(self, data: dict, key: str) -> Optional[str]:
+        """Walk ``data`` along the dot-separated ``key``; return the value only if it is a string."""
+        keys = key.split('.')
+        current = data
+        for k in keys:
+            if isinstance(current, dict) and k in current:
+                current = current[k]
+            else:
+                return None
+        return current if isinstance(current, str) else None
+
+    def _format_translation(self, translation: str, **kwargs) -> str:
+        """Fill ``{name}`` placeholders from ``kwargs``; return the raw text if it cannot be formatted."""
+        try:
+            return translation.format(**kwargs)
+        except (IndexError, KeyError, ValueError):
+            return translation
+
+
+# Global translation manager instance
+_translation_manager = TranslationManager()
+
+
+def set_locale(locale: str):
+    """Set the global
locale""" + _translation_manager.set_locale(locale) + + +def get_locale() -> str: + """Get the current locale""" + return _translation_manager.current_locale + + +def _(key: str, **kwargs) -> str: + """Translation function - shorthand for get_translation""" + return _translation_manager.get_translation(key, **kwargs) + + +def t(key: str, **kwargs) -> str: + """Translation function - alias for _()""" + return _(key, **kwargs) + + +# Export main functions +__all__ = ['set_locale', 'get_locale', '_', 't', 'TranslationManager'] \ No newline at end of file diff --git a/src/ydata_profiling/i18n/locales/en.json b/src/ydata_profiling/i18n/locales/en.json new file mode 100644 index 000000000..ce8647d02 --- /dev/null +++ b/src/ydata_profiling/i18n/locales/en.json @@ -0,0 +1,104 @@ +{ + "report": { + "title": "YData Profiling Report", + "overview": "Overview", + "variables": "Variables", + "interactions": "Interactions", + "correlations": "Correlations", + "missing_values": "Missing values", + "sample": "Sample", + "duplicates": "Duplicate rows", + "brought_to_you_by": "Brought to you by YData", + "footer_text": "Report generated by YData." 
+ }, + "overview": { + "dataset_info": "Dataset info", + "variable_types": "Variable types", + "dataset_statistics": "Dataset statistics", + "number_of_variables": "Number of variables", + "number_of_observations": "Number of observations", + "missing_cells": "Missing cells", + "missing_cells_percentage": "Missing cells (%)", + "duplicate_rows": "Duplicate rows", + "duplicate_rows_percentage": "Duplicate rows (%)", + "total_size_in_memory": "Total size in memory", + "average_record_size": "Average record size in memory" + }, + "variables": { + "numeric": "Numeric", + "categorical": "Categorical", + "boolean": "Boolean", + "date": "Date", + "text": "Text", + "url": "URL", + "path": "Path", + "image": "Image", + "distinct": "Distinct", + "distinct_percentage": "Distinct (%)", + "missing": "Missing", + "missing_percentage": "Missing (%)", + "memory_size": "Memory size", + "statistics": "Statistics", + "quantile_statistics": "Quantile statistics", + "common_values": "Common values", + "extreme_values": "Extreme values", + "histogram": "Histogram", + "min": "Minimum", + "max": "Maximum", + "mean": "Mean", + "mode": "Mode", + "standard_deviation": "Standard deviation", + "sum": "Sum", + "mad": "Median Absolute Deviation (MAD)", + "coefficient_of_variation": "Coefficient of variation (CV)", + "kurtosis": "Kurtosis", + "skewness": "Skewness", + "range": "Range", + "interquartile_range": "Interquartile range (IQR)", + "length": "Length", + "sample": "Sample" + }, + "correlations": { + "pearson": "Pearson's r", + "spearman": "Spearman's ρ", + "kendall": "Kendall's τ", + "phi_k": "Phik (φk)", + "cramers": "Cramér's V (φc)", + "auto": "Auto" + }, + "interactions": { + "scatter_plot": "Scatter plot", + "variable": "Variable" + }, + "missing_values": { + "matrix": "Matrix", + "bar_chart": "Bar chart", + "heatmap": "Heatmap", + "dendrogram": "Dendrogram" + }, + "alerts": { + "high_correlation": "High correlation", + "high_cardinality": "High cardinality", + "constant": 
"Constant", + "zeros": "Zeros", + "missing": "Missing", + "skewed": "Skewed", + "infinite": "Infinite", + "type_date": "Date", + "uniform": "Uniform", + "unique": "Unique" + }, + "formatting": { + "bytes": "{value} bytes", + "kb": "{value} KB", + "mb": "{value} MB", + "gb": "{value} GB", + "percentage": "{value}%" + }, + "rendering": { + "generate_structure": "Generate report structure", + "html_progress": "Render HTML", + "json_progress": "Render JSON", + "widgets_progress": "Render widgets" + } +} \ No newline at end of file diff --git a/src/ydata_profiling/i18n/locales/zh.json b/src/ydata_profiling/i18n/locales/zh.json new file mode 100644 index 000000000..d941067d6 --- /dev/null +++ b/src/ydata_profiling/i18n/locales/zh.json @@ -0,0 +1,104 @@ +{ + "report": { + "title": "YData 数据分析报告", + "overview": "概览", + "variables": "变量", + "interactions": "交互分析", + "correlations": "相关性", + "missing_values": "缺失值", + "sample": "样本数据", + "duplicates": "重复数据", + "brought_to_you_by": "由 YData 为您提供", + "footer_text": "报告由 YData 生成。" + }, + "overview": { + "dataset_info": "数据集信息", + "variable_types": "变量类型", + "dataset_statistics": "数据集统计", + "number_of_variables": "变量数量", + "number_of_observations": "观测数量", + "missing_cells": "缺失单元格", + "missing_cells_percentage": "缺失单元格百分比", + "duplicate_rows": "重复行", + "duplicate_rows_percentage": "重复行百分比", + "total_size_in_memory": "内存总大小", + "average_record_size": "平均记录内存大小" + }, + "variables": { + "numeric": "数值型", + "categorical": "分类型", + "boolean": "布尔型", + "date": "日期型", + "text": "文本型", + "url": "网址型", + "path": "路径型", + "image": "图像型", + "distinct": "唯一值", + "distinct_percentage": "唯一值百分比", + "missing": "缺失值", + "missing_percentage": "缺失值百分比", + "memory_size": "内存大小", + "statistics": "统计信息", + "quantile_statistics": "分位数统计", + "common_values": "常见值", + "extreme_values": "极值", + "histogram": "直方图", + "min": "最小值", + "max": "最大值", + "mean": "均值", + "mode": "众数", + "standard_deviation": "标准差", + "sum": "总和", + "mad": "中位绝对偏差 (MAD)", + 
"coefficient_of_variation": "变异系数 (CV)", + "kurtosis": "峰度", + "skewness": "偏度", + "range": "范围", + "interquartile_range": "四分位距 (IQR)", + "length": "长度", + "sample": "样本" + }, + "correlations": { + "pearson": "皮尔逊相关系数", + "spearman": "斯皮尔曼相关系数", + "kendall": "肯德尔相关系数", + "phi_k": "Phik相关系数 (φk)", + "cramers": "克拉默V系数 (φc)", + "auto": "自动" + }, + "interactions": { + "scatter_plot": "散点图", + "variable": "变量" + }, + "missing_values": { + "matrix": "矩阵图", + "bar_chart": "条形图", + "heatmap": "热力图", + "dendrogram": "树状图" + }, + "alerts": { + "high_correlation": "高相关性", + "high_cardinality": "高基数", + "constant": "常量", + "zeros": "零值", + "missing": "缺失", + "skewed": "偏斜", + "infinite": "无穷值", + "type_date": "日期", + "uniform": "均匀分布", + "unique": "唯一值" + }, + "formatting": { + "bytes": "{value} 字节", + "kb": "{value} KB", + "mb": "{value} MB", + "gb": "{value} GB", + "percentage": "{value}%" + }, + "rendering": { + "generate_structure": "生成报告结构", + "html_progress": "渲染HTML报告", + "json_progress": "渲染JSON报告", + "widgets_progress": "渲染组件" + } +} \ No newline at end of file diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index acf18c7e0..3d5cd7076 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -47,6 +47,7 @@ from ydata_profiling.utils.dataframe import hash_dataframe from ydata_profiling.utils.logger import ProfilingLogger from ydata_profiling.utils.paths import get_config +from ydata_profiling.i18n import _, set_locale, get_locale logger = ProfilingLogger(name="ReportLogger") @@ -80,6 +81,7 @@ def __init__( summarizer: Optional[BaseSummarizer] = None, config: Optional[Settings] = None, type_schema: Optional[dict] = None, + locale: Optional[str] = None, **kwargs, ): """Generate a ProfileReport based on a pandas or spark.sql DataFrame @@ -103,8 +105,21 @@ def __init__( typeset: optional user typeset to use for type inference summarizer: optional user summarizer to generate custom summary 
output
             type_schema: optional dict containing pairs of `column name`: `type`
+            locale: language locale for report generation (e.g., 'en', 'zh')
             **kwargs: other arguments, for valid arguments, check the default configuration file.
         """
+        from ydata_profiling.i18n import set_locale, get_locale
+
+        # Remember the locale that is active before any configuration is loaded
+        current_locale = get_locale()
+
+        # If the locale parameter is explicitly specified, use it
+        if locale:
+            set_locale(locale)
+            target_locale = locale
+        else:
+            # Otherwise, use the current global language setting
+            target_locale = current_locale
 
         self.__validate_inputs(df, minimal, tsmode, config_file, lazy)
 
@@ -123,6 +138,11 @@ def __init__(
             else:
                 report_config = SparkSettings()
 
+        # A non-default locale that was active beforehand (or passed explicitly) overrides the loaded config
+        if target_locale != 'en':
+            report_config.i18n.locale = target_locale
+            set_locale(target_locale)
+
         groups = [
             (explorative, "explorative"),
             (sensitive, "sensitive"),
@@ -144,6 +164,13 @@ def __init__(
         if kwargs:
             report_config = report_config.update(kwargs)
 
+        # An explicit ``locale`` argument takes precedence. Without one, keep the
+        # locale the configuration resolved to (config file / config / kwargs)
+        # instead of overwriting it with the locale that was active beforehand.
+        if locale:
+            report_config.i18n.locale = locale
+        set_locale(report_config.i18n.locale)
+
         report_config.vars.timeseries.active = tsmode
         if tsmode and sortby:
             report_config.vars.timeseries.sortby = sortby
@@ -411,7 +438,7 @@ def _render_html(self) -> str:
         report = self.report
 
         with tqdm(
-            total=1, desc="Render HTML", disable=not self.config.progress_bar
+            total=1, desc=_("rendering.html_progress"), disable=not self.config.progress_bar
         ) as pbar:
             html = HTMLReport(copy.deepcopy(report)).render(
                 nav=self.config.html.navbar_show,
@@ -440,7 +467,7 @@ def _render_widgets(self) -> Any:
 
         with tqdm(
             total=1,
-            desc="Render widgets",
+            desc=_("rendering.widgets_progress"),
             disable=not self.config.progress_bar,
             leave=False,
         ) as pbar:
@@ -477,7 +504,7 @@ def encode_it(o: Any) -> Any:
         description = self.description_set
 
         with tqdm(
-            total=1, desc="Render JSON", disable=not
self.config.progress_bar
+            total=1, desc=_("rendering.json_progress"), disable=not self.config.progress_bar
         ) as pbar:
             description_dict = format_summary(description)
             description_dict = encode_it(description_dict)
diff --git a/src/ydata_profiling/report/presentation/flavours/html/__init__.py b/src/ydata_profiling/report/presentation/flavours/html/__init__.py
index 50e234f77..72be96f2d 100644
--- a/src/ydata_profiling/report/presentation/flavours/html/__init__.py
+++ b/src/ydata_profiling/report/presentation/flavours/html/__init__.py
@@ -25,6 +25,7 @@
 from ydata_profiling.report.presentation.flavours.html.variable_info import (
     HTMLVariableInfo,
 )
+from ydata_profiling.report.presentation.flavours.html.i18n_extension import I18nExtension
 
 __all__ = [
     "HTMLCollapse",
@@ -44,4 +45,5 @@
     "HTMLAlerts",
     "HTMLCorrelationTable",
     "HTMLScores",
+    "I18nExtension",
 ]
diff --git a/src/ydata_profiling/report/presentation/flavours/html/i18n_extension.py b/src/ydata_profiling/report/presentation/flavours/html/i18n_extension.py
new file mode 100644
index 000000000..47dbf0f5e
--- /dev/null
+++ b/src/ydata_profiling/report/presentation/flavours/html/i18n_extension.py
@@ -0,0 +1,39 @@
+"""
+Jinja2 internationalization extension for ydata-profiling
+"""
+from jinja2 import nodes
+from jinja2.ext import Extension
+
+from ydata_profiling.i18n import _
+
+
+class I18nExtension(Extension):
+    """Jinja2 extension exposing ydata-profiling translations to templates.
+
+    Registers ``_`` and ``gettext`` as template globals, ``trans`` as a filter,
+    and a ``{% trans <expr> %}`` tag that outputs the translation of the key
+    that ``<expr>`` evaluates to.
+    """
+
+    tags = {'trans'}
+
+    def __init__(self, environment):
+        """Register the translation globals and filter on ``environment``."""
+        super().__init__(environment)
+        environment.globals['_'] = _
+        environment.globals['gettext'] = _
+        environment.filters['trans'] = self.translate_filter
+
+    def translate_filter(self, key, **kwargs):
+        """Return the translation for ``key``, forwarding ``kwargs`` to ``_``."""
+        return _(key, **kwargs)
+
+    def parse(self, parser):
+        """Compile ``{% trans <expr> %}`` into an output of the translated key."""
+        lineno = next(parser.stream).lineno
+        key = parser.parse_expression()
+        # ``nodes.Call`` must be constructed with all five of its fields;
+        # ``call_method`` supplies them and calls this extension's own method,
+        # so the tag does not depend on a template-level ``_`` name.
+        call = self.call_method('translate_filter', [key], lineno=lineno)
+        return nodes.Output([call]).set_lineno(lineno)
diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates.py b/src/ydata_profiling/report/presentation/flavours/html/templates.py
index 85e24a46a..69a8dc1b8 100644
--- a/src/ydata_profiling/report/presentation/flavours/html/templates.py
+++ b/src/ydata_profiling/report/presentation/flavours/html/templates.py
@@ -6,20 +6,25 @@
 from ydata_profiling.config import Settings
 from ydata_profiling.report.formatters import fmt, fmt_badge, fmt_numeric, fmt_percent
+from ydata_profiling.report.presentation.flavours.html.i18n_extension import I18nExtension
 
 # Initializing Jinja
 package_loader = jinja2.PackageLoader(
     "ydata_profiling", "report/presentation/flavours/html/templates"
 )
 jinja2_env = jinja2.Environment(
-    lstrip_blocks=True, trim_blocks=True, loader=package_loader
+    lstrip_blocks=True,
+    trim_blocks=True,
+    loader=package_loader,
+    # I18nExtension installs the ``_``/``gettext`` globals and the ``trans`` filter itself
+    extensions=[I18nExtension],
 )
 jinja2_env.filters["is_list"] = lambda x: isinstance(x, list)
 jinja2_env.filters["fmt_badge"] = fmt_badge
 jinja2_env.filters["fmt_percent"] = fmt_percent
 jinja2_env.filters["fmt_numeric"] = fmt_numeric
 jinja2_env.filters["fmt"] = fmt
 
 
 def template(template_name: str) -> jinja2.Template:
     """Get the template object given the name.
diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/overview_tabs.html b/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/overview_tabs.html index e068278fb..d5d105a82 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/overview_tabs.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/overview_tabs.html @@ -1,6 +1,6 @@ {% if tabs | length > 0 %} {% if oss %} -

Brought to you by YData

+

{{ _('report.brought_to_you_by') | safe }}

{% endif %}