diff --git a/docs/advanced_usage/custom_translations.md b/docs/advanced_usage/custom_translations.md new file mode 100644 index 000000000..99291a436 --- /dev/null +++ b/docs/advanced_usage/custom_translations.md @@ -0,0 +1,75 @@ +# 自定义翻译 + +ydata-profiling 支持用户自定义翻译文件,让您可以添加新的语言支持或修改现有翻译。 + +## 快速开始 +### 1. 安装 ydata-profiling +```bash +pip install ydata-profiling +``` + +### 2. 导出翻译模板 +#### 方法A:使用命令行工具生成模板 +```bash +ydata-profiling-translate create-template -l en -o ./ +# 这会生成 en_template.json 文件 +``` +#### 方法B:使用Python代码生成模板 +```python +from ydata_profiling.i18n import export_translation_template + +# 导出英文模板作为翻译基础 +export_translation_template('en', 'translation_template.json') +``` + +### 3. 编辑翻译文件 +#### 将模板文件重命名并翻译,例如创建 french.json: +```json +{ + "report": { + "title": "Rapport d'Analyse YData", + "overview": "Aperçu" + } +} +``` + +### 4. 在您的项目代码中使用 +```python +# 您的项目文件,例如 data_analysis.py +import pandas as pd +from ydata_profiling import ProfileReport +from ydata_profiling.i18n import load_translation_file, set_locale + +# 加载您创建的翻译文件 +load_translation_file('./french.json', 'fr') + +# 设置语言并生成报告 +set_locale('fr') +df = pd.read_csv('your_data.csv') +profile = ProfileReport(df, title="Mon Analyse de Données") +profile.to_file("rapport_francais.html") +``` + +### 项目结构示例 +#### 您的项目结构可能如下: +```bash +my_data_project/ +├── data/ +│ └── dataset.csv +├── translations/ # 您的翻译文件目录 +│ ├── zh.json +│ ├── french.json +│ └── german.json +├── analysis.py # 您的分析脚本 +└── requirements.txt +``` +#### 在 analysis.py 中: +```python +from ydata_profiling.i18n import add_translation_directory, set_locale + +# 加载整个翻译目录 +add_translation_directory('./translations/') + +# 现在可以使用任何语言 +set_locale('zh') # 使用中文 +``` \ No newline at end of file diff --git a/docs/features/internationalization.md b/docs/features/internationalization.md new file mode 100644 index 000000000..3a0e1d64c --- /dev/null +++ b/docs/features/internationalization.md @@ -0,0 +1,40 @@ +# 国际化支持 + +ydata-profiling 现在支持多语言报告生成。 + +## 支持的语言 + +- 
英文 (en) - 默认 +- 中文 (zh) + +## 使用方法 + +### 方法1:全局设置语言 + +```python +from ydata_profiling import ProfileReport +from ydata_profiling.i18n import set_locale + +# 设置中文 +set_locale('zh') +profile = ProfileReport(df) +profile.to_file('中文报告.html') +``` + +### 方法2:在初始化时指定语言 +```python +# 直接指定语言 +profile = ProfileReport(df, locale='zh', title='我的数据报告') +profile.to_file('报告.html') +``` +### 方法3:通过配置文件 +```yaml +# config.yaml +i18n: + locale: "zh" + auto_detect: false + fallback_locale: "en" +``` +```python +profile = ProfileReport(df, config_file='config.yaml') +``` diff --git a/examples/i18n/advanced_usage_example.py b/examples/i18n/advanced_usage_example.py new file mode 100644 index 000000000..3925940be --- /dev/null +++ b/examples/i18n/advanced_usage_example.py @@ -0,0 +1,1025 @@ +""" +Complete workflow example for custom translations in ydata-profiling +演示如何使用 ydata-profiling 的自定义翻译功能的完整工作流程 +""" +import pandas as pd +import json +import tempfile +import shutil +from pathlib import Path +from ydata_profiling import ProfileReport +from ydata_profiling.i18n import ( + export_translation_template, + load_translation_file, + add_translation_directory, + set_locale, + get_available_locales, + get_locale +) +from ydata_profiling.i18n import _ + + +def create_sample_data(): + """创建示例数据""" + print("📊 Creating sample dataset...") + + data = { + 'product_name': ['iPhone 14', 'Samsung Galaxy', 'Google Pixel', 'iPhone 14', 'OnePlus 10'], + 'price': [999, 899, 799, 999, 649], + 'category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics'], + 'rating': [4.5, 4.2, 4.0, 4.5, 3.8], + 'in_stock': [True, True, False, True, True], + 'release_date': ['2022-09-16', '2022-02-25', '2022-10-13', '2022-09-16', '2022-01-11'] + } + + df = pd.DataFrame(data) + print(f"✅ Sample dataset created with {len(df)} rows and {len(df.columns)} columns") + return df + + +def step1_export_template(): + """步骤1: 导出翻译模板""" + print("\n🔧 Step 1: Exporting translation template...") + + # 
导出英文模板作为翻译基础 + template_file = "en_translation_template.json" + export_translation_template('en', template_file) + + print(f"✅ Translation template exported to: {template_file}") + + # 显示模板内容预览 + with open(template_file, 'r', encoding='utf-8') as f: + template_data = json.load(f) + + print("📋 Template preview (first few keys):") + print(json.dumps({k: v for k, v in list(template_data.items())[:2]}, indent=2)) + + return template_file + + +def step2_create_custom_translations(template_file): + """步骤2: 基于模板创建自定义翻译""" + print(f"\n🌍 Step 2: Creating custom translations based on {template_file}...") + + # 读取模板 + with open(template_file, 'r', encoding='utf-8') as f: + template = json.load(f) + + # 创建法语翻译 + french_translation = { + "report": { + "title": "Rapport de profilage YData", + "overview": "Aperçu", + "variables": "Variables", + "interactions": "Interactions", + "correlations": "Corrélations", + "missing_values": "Valeurs manquantes", + "sample": "Échantillon", + "duplicates": "Lignes dupliquées", + "footer_text": "Rapport généré par YData.", + "most_frequently_occurring": "Les plus fréquemment observés", + "columns": "Colonnes", + "more_details": "Plus de détails" + }, + "overview": { + "dataset_info": "Informations sur l'ensemble de données", + "variable_types": "Types de variables", + "dataset_statistics": "Statistiques de l'ensemble de données", + "number_of_variables": "Nombre de variables", + "number_of_observations": "Nombre d'observations", + "missing_cells": "Cellules manquantes", + "missing_cells_percentage": "Cellules manquantes (%)", + "duplicate_rows": "Lignes dupliquées", + "duplicate_rows_percentage": "Lignes dupliquées (%)", + "average_record_size": "Taille moyenne d'un enregistrement en mémoire" + }, + "variables": { + "numeric": "Numérique", + "categorical": "Catégorique", + "boolean": "Booléen", + "date": "Date", + "text": "Texte", + "url": "URL", + "path": "Chemin", + "image": "Image", + "distinct": "Distinct", + "distinct_percentage": 
"Distinct (%)", + "missing": "Manquant", + "missing_percentage": "Manquant (%)", + "statistics": "Statistiques", + "quantile_statistics": "Statistiques de quantiles", + "common_values": "Valeurs courantes", + "histogram": "Histogramme", + "mode": "Mode", + "standard_deviation": "Écart-type", + "sum": "Somme", + "mad": "Déviation absolue médiane (DAM)", + "coefficient_of_variation": "Coefficient de variation (CV)", + "kurtosis": "Kurtosis", + "skewness": "Asymétrie", + "range": "Étendue", + "interquartile_range": "Écart interquartile (IQR)", + "length": "Longueur", + "sample": "Échantillon" + }, + "correlations": { + "pearson": "Corrélation de Pearson (r)", + "spearman": "Corrélation de Spearman (ρ)", + "kendall": "Corrélation de Kendall (τ)", + "phi_k": "Phik (φk)", + "cramers": "V de Cramér (φc)", + "auto": "Automatique" + }, + "interactions": { + "scatter_plot": "Nuage de points", + "variable": "Variable" + }, + "missing_values": { + "matrix": "Matrice", + "bar_chart": "Graphique à barres", + "heatmap": "Carte thermique", + "dendrogram": "Dendrogramme" + }, + "alerts": { + "high_correlation": "Corrélation élevée", + "high_cardinality": "Cardinalité élevée", + "constant": "Constante", + "zeros": "Zéros", + "missing": "Manquant", + "skewed": "Asymétrique", + "type_date": "Date", + "uniform": "Uniforme", + "unique": "Unique", + "constant_length": "Longueur constante", + "duplicates": "Duplicatas", + "empty": "Vide", + "imbalance": "Déséquilibre", + "near_duplicates": "Quasi-duplicatas", + "non_stationary": "Non stationnaire", + "seasonal": "Saisonnier", + "truncated": "Tronqué", + "unsupported": "Non supporté", + "dirty_category": "Catégorie non propre" + }, + "formatting": { + "bytes": "{value} octets", + "kb": "{value} Ko", + "mb": "{value} Mo", + "gb": "{value} Go", + "percentage": "{value} %" + }, + "rendering": { + "generate_structure": "Générer la structure du rapport", + "html_progress": "Rendu HTML", + "json_progress": "Rendu JSON", + "widgets_progress": 
"Rendu des widgets", + "other_values_count": "Autres valeurs ({other_count})", + "missing": "(Manquant)" + }, + "core": { + "unknown": "inconnu", + "alerts": "Alertes", + "collapse": "Réduire", + "container": "Conteneur", + "correlationTable": "Tableau de corrélation", + "dropdown": "Menu déroulant", + "duplicate": "Duplicata", + "frequencyTable": "Tableau de fréquence", + "frequencyTableSmall": "Tableau de fréquence réduit", + "html": "HTML", + "image": "Image", + "sample": "Échantillon", + "scores": "Scores", + "table": "Tableau", + "toggle_button": "Bouton de bascule", + "variable": "Variable", + "variable_info": "Informations sur la variable", + "model": { + "bar_count": "Compte", + "bar_caption": "Une visualisation simple des valeurs nulles par colonne.", + "matrix": "Matrice", + "matrix_caption": "La matrice de nullité est une représentation dense des données qui permet de repérer rapidement visuellement les modèles de complétude des données.", + "heatmap": "Carte thermique", + "heatmap_caption": "La carte thermique de corrélation mesure la corrélation de nullité : à quel point la présence ou l'absence d'une variable affecte la présence d'une autre.", + "first_rows": "Premières lignes", + "last_rows": "Dernières lignes", + "random_sample": "Échantillon aléatoire" + }, + "structure": { + "correlations": "Corrélations", + "heatmap": "Carte thermique", + "table": "Tableau", + "overview": { + "values": "valeurs", + "number_variables": "Nombre de variables", + "number_observations": "Nombre d'observations", + "missing_cells": "Cellules manquantes", + "missing_cells_percentage": "Cellules manquantes (%)", + "duplicate_rows": "Lignes dupliquées", + "duplicate_rows_percentage": "Lignes dupliquées (%)", + "total_size_memory": "Taille totale en mémoire", + "average_record_memory": "Taille moyenne d'un enregistrement en mémoire", + "dataset_statistics": "Statistiques de l'ensemble de données", + "variable_types": "Types de variables", + "overview": "Aperçu", + "url": 
"URL", + "copyright": "Droits d'auteur", + "dataset": "Ensemble de données", + "analysis_started": "Analyse commencée", + "analysis_finished": "Analyse terminée", + "duration": "Durée", + "software_version": "Version du logiciel", + "download_configuration": "Télécharger la configuration", + "reproduction": "Reproduction", + "variable_descriptions": "Descriptions des variables", + "variables": "Variables", + "alerts_count": "Alertes ({count})", + "number_of_series": "Nombre de séries", + "timeseries_length": "Longueur de la série temporelle", + "starting_point": "Point de départ", + "ending_point": "Point de fin", + "period": "Période", + "timeseries_statistics": "Statistiques des séries temporelles", + "original": "Original", + "scaled": "Échelonné", + "time_series": "Séries temporelles", + "interactions": "Interactions", + "distinct": "Distinct", + "distinct_percentage": "Distinct (%)", + "missing": "Manquant", + "missing_percentage": "Manquant (%)", + "memory_size": "Taille en mémoire", + "file": "Fichier", + "size": "Taille", + "file_size": "Taille du fichier", + "file_size_caption": "Histogramme avec des intervalles fixes de tailles de fichiers (en octets)", + "unique": "Unique", + "unique_help": "Le nombre de valeurs uniques (toutes les valeurs qui n'apparaissent qu'une seule fois dans l'ensemble de données).", + "unique_percentage": "Unique (%)", + "max_length": "Longueur maximale", + "median_length": "Longueur médiane", + "mean_length": "Longueur moyenne", + "min_length": "Longueur minimale", + "length": "Longueur", + "length_histogram": "Histogramme de longueur", + "histogram_lengths_category": "Histogramme des longueurs de la catégorie", + "most_occurring_categories": "Catégories les plus fréquentes", + "most_frequent_character_per_category": "Caractère le plus fréquent par catégorie", + "most_occurring_scripts": "Scripts les plus fréquents", + "most_frequent_character_per_script": "Caractère le plus fréquent par script", + "most_occurring_blocks": "Blocs 
les plus fréquents", + "most_frequent_character_per_block": "Caractère le plus fréquent par bloc", + "total_characters": "Nombre total de caractères", + "distinct_characters": "Caractères distincts", + "distinct_categories": "Catégories distinctes", + "unicode_categories": "Catégories Unicode (cliquez pour plus d'informations)", + "distinct_scripts": "Scripts distincts", + "unicode_scripts": "Scripts Unicode (cliquez pour plus d'informations)", + "distinct_blocks": "Blocs distincts", + "unicode_blocks": "Blocs Unicode (cliquez pour plus d'informations)", + "characters_unicode": "Caractères et Unicode", + "characters_unicode_caption": "La norme Unicode attribue des propriétés à chaque point de code, qui peuvent être utilisées pour analyser des variables textuelles.", + "most_occurring_characters": "Caractères les plus fréquents", + "characters": "Caractères", + "categories": "Catégories", + "scripts": "Scripts", + "blocks": "Blocs", + "unicode": "Unicode", + "common_values": "Valeurs courantes", + "common_values_table": "Valeurs courantes (Tableau)", + "1st_row": "1ère ligne", + "2nd_row": "2ème ligne", + "3rd_row": "3ème ligne", + "4th_row": "4ème ligne", + "5th_row": "5ème ligne", + "categories_passes_threshold": "Nombre de catégories de variables dépassant le seuil (config.plot.cat_freq.max_unique)", + "common_values_plot": "Valeurs courantes (Graphique)", + "common_words": "Mots courants", + "wordcloud": "Nuage de mots", + "words": "Mots", + "mean": "Moyenne", + "min": "Minimum", + "max": "Maximum", + "zeros": "Zéros", + "zeros_percentage": "Zéros (%)", + "scatter": "Nuage", + "scatterplot": "Nuage de points", + "scatterplot_caption": "Nuage de points dans le plan complexe", + "mini_histogram": "Mini-histogramme", + "histogram": "Histogramme", + "histogram_caption": "Histogramme avec des intervalles fixes", + "extreme_values": "Valeurs extrêmes", + "histogram_s": "Histogramme(s)", + "invalid_dates": "Dates invalides", + "invalid_dates_percentage": "Dates 
invalides (%)", + "created": "Créé", + "accessed": "Accédé", + "modified": "Modifié", + "min_width": "Largeur minimale", + "median_width": "Largeur médiane", + "max_width": "Largeur maximale", + "min_height": "Hauteur minimale", + "median_height": "Hauteur médiane", + "max_height": "Hauteur maximale", + "min_area": "Surface minimale", + "median_area": "Surface médiane", + "max_area": "Surface maximale", + "scatter_plot_image_sizes": "Nuage de points des tailles d'image", + "scatter_plot": "Nuage de points", + "dimensions": "Dimensions", + "exif_keys": "Clés EXIF", + "exif_data": "Données EXIF", + "image": "Image", + "common_prefix": "Préfixe commun", + "unique_stems": "Racines uniques", + "unique_names": "Noms uniques", + "unique_extensions": "Extensions uniques", + "unique_directories": "Répertoires uniques", + "unique_anchors": "Ancres uniques", + "full": "Complet", + "stem": "Racine", + "name": "Nom", + "extension": "Extension", + "parent": "Parent", + "anchor": "Ancre", + "path": "Chemin", + "infinite": "Infini", + "infinite_percentage": "Infini (%)", + "Negative": "Négatif", + "Negative_percentage": "Négatif (%)", + "5_th_percentile": "5e centile", + "q1": "Q1", + "median": "Médiane", + "q3": "Q3", + "95_th_percentile": "95e centile", + "range": "Étendue", + "iqr": "Écart interquartile (IQR)", + "quantile_statistics": "Statistiques de quantiles", + "standard_deviation": "Écart-type", + "cv": "Coefficient de variation (CV)", + "kurtosis": "Kurtosis", + "mad": "Déviation absolue médiane (DAM)", + "skewness": "Asymétrie", + "sum": "Somme", + "variance": "Variance", + "monotonicity": "Monotonie", + "descriptive_statistics": "Statistiques descriptives", + "statistics": "Statistiques", + "augmented_dickey_fuller_test_value": "Valeur p du test de Dickey-Fuller augmenté", + "autocorrelation": "Autocorrélation", + "autocorrelation_caption": "ACF et PACF", + "timeseries": "Série temporelle", + "timeseries_plot": "Graphique de série temporelle", + "scheme": "Schéma", + 
"netloc": "Emplacement réseau", + "query": "Requête", + "fragment": "Fragment", + "heatmap": "Carte thermique" + } + } + }, + "html": { + "alerts": { + "title": "Alertes", + "not_present": "Aucune alerte présente dans cet ensemble de données", + "has_constant_value": "a une valeur constante", + "has_constant_length": "a une longueur constante", + "has_dirty_categories": "a des catégories non propres", + "has_high_cardinality": "a une cardinalité élevée", + "distinct_values": "valeurs distinctes", + "dataset_has": "L'ensemble de données a", + "duplicate_rows": "lignes dupliquées", + "dataset_is_empty": "L'ensemble de données est vide", + "is_highly": "est fortement", + "correlated_with": "corrélé avec", + "and": "et", + "other_fields": "autres champs", + "highly_imbalanced": "est fortement déséquilibré", + "has": "a", + "infinite_values": "valeurs infinies", + "missing_values": "valeurs manquantes", + "near_duplicate_rows": "lignes quasi-dupliquées", + "non_stationary": "est non stationnaire", + "seasonal": "est saisonnier", + "highly_skewed": "est fortement asymétrique", + "truncated_files": "fichiers tronqués", + "alert_type_date": "contient uniquement des valeurs datetime, mais est catégorique. 
Envisagez d'appliquer", + "uniformly_distributed": "est uniformément distribué", + "unique_values": "a des valeurs uniques", + "alert_unsupported": "est un type non supporté, vérifiez s'il nécessite un nettoyage ou une analyse supplémentaire", + "zeros": "zéros" + }, + "sequence": { + "overview_tabs": { + "brought_to_you_by": "Présenté par YData" + } + }, + "dropdown": "Sélectionner les colonnes", + "frequency_table": { + "value": "Valeur", + "count": "Compte", + "frequency_percentage": "Fréquence (%)", + "redacted_value": "Valeur masquée", + "no_values_found": "Aucune valeur trouvée" + }, + "scores": { + "overall_data_quality": "Score global de la qualité des données" + }, + "variable_info": { + "no_alerts": "Aucune alerte" + } + } + } + + # 创建西班牙语翻译 + spanish_translation = { + "report": { + "title": "Informe de Perfilado de YData", + "overview": "Resumen", + "variables": "Variables", + "interactions": "Interacciones", + "correlations": "Correlaciones", + "missing_values": "Valores faltantes", + "sample": "Muestra", + "duplicates": "Filas duplicadas", + "footer_text": "Informe generado por YData.", + "most_frequently_occurring": "Los más frecuentes", + "columns": "Columnas", + "more_details": "Más detalles" + }, + "overview": { + "dataset_info": "Información del conjunto de datos", + "variable_types": "Tipos de variables", + "dataset_statistics": "Estadísticas del conjunto de datos", + "number_of_variables": "Número de variables", + "number_of_observations": "Número de observaciones", + "missing_cells": "Celdas faltantes", + "missing_cells_percentage": "Celdas faltantes (%)", + "duplicate_rows": "Filas duplicadas", + "duplicate_rows_percentage": "Filas duplicadas (%)", + "average_record_size": "Tamaño promedio de registro en memoria" + }, + "variables": { + "numeric": "Numérico", + "categorical": "Categórico", + "boolean": "Booleano", + "date": "Fecha", + "text": "Texto", + "url": "URL", + "path": "Ruta", + "image": "Imagen", + "distinct": "Distinto", + 
"distinct_percentage": "Distinto (%)", + "missing": "Faltante", + "missing_percentage": "Faltante (%)", + "statistics": "Estadísticas", + "quantile_statistics": "Estadísticas de cuantiles", + "common_values": "Valores comunes", + "histogram": "Histograma", + "mode": "Moda", + "standard_deviation": "Desviación estándar", + "sum": "Suma", + "mad": "Desviación absoluta mediana (DAM)", + "coefficient_of_variation": "Coeficiente de variación (CV)", + "kurtosis": "Curtosis", + "skewness": "Asimetría", + "range": "Rango", + "interquartile_range": "Rango intercuartílico (IQR)", + "length": "Longitud", + "sample": "Muestra" + }, + "correlations": { + "pearson": "Correlación de Pearson (r)", + "spearman": "Correlación de Spearman (ρ)", + "kendall": "Correlación de Kendall (τ)", + "phi_k": "Phik (φk)", + "cramers": "V de Cramér (φc)", + "auto": "Automático" + }, + "interactions": { + "scatter_plot": "Gráfico de dispersión", + "variable": "Variable" + }, + "missing_values": { + "matrix": "Matriz", + "bar_chart": "Gráfico de barras", + "heatmap": "Mapa de calor", + "dendrogram": "Dendrograma" + }, + "alerts": { + "high_correlation": "Correlación alta", + "high_cardinality": "Alta cardinalidad", + "constant": "Constante", + "zeros": "Ceros", + "missing": "Faltante", + "skewed": "Asimétrico", + "type_date": "Fecha", + "uniform": "Uniforme", + "unique": "Único", + "constant_length": "Longitud constante", + "duplicates": "Duplicados", + "empty": "Vacío", + "imbalance": "Desequilibrio", + "near_duplicates": "Casi duplicados", + "non_stationary": "No estacionario", + "seasonal": "Estacional", + "truncated": "Truncado", + "unsupported": "No soportado", + "dirty_category": "Categoría sucia" + }, + "formatting": { + "bytes": "{value} bytes", + "kb": "{value} KB", + "mb": "{value} MB", + "gb": "{value} GB", + "percentage": "{value}%" + }, + "rendering": { + "generate_structure": "Generar estructura del informe", + "html_progress": "Renderizar HTML", + "json_progress": "Renderizar JSON", 
+ "widgets_progress": "Renderizar widgets", + "other_values_count": "Otros valores ({other_count})", + "missing": "(Faltante)" + }, + "core": { + "unknown": "desconocido", + "alerts": "Alertas", + "collapse": "Colapsar", + "container": "Contenedor", + "correlationTable": "Tabla de correlación", + "dropdown": "Menú desplegable", + "duplicate": "Duplicado", + "frequencyTable": "Tabla de frecuencia", + "frequencyTableSmall": "Tabla de frecuencia pequeña", + "html": "HTML", + "image": "Imagen", + "sample": "Muestra", + "scores": "Puntuaciones", + "table": "Tabla", + "toggle_button": "Botón de alternancia", + "variable": "Variable", + "variable_info": "Información de la variable", + "model": { + "bar_count": "Conteo", + "bar_caption": "Una visualización simple de la nulidad por columna.", + "matrix": "Matriz", + "matrix_caption": "La matriz de nulidad es una representación densa de datos que permite identificar rápidamente patrones visuales en la completitud de los datos.", + "heatmap": "Mapa de calor", + "heatmap_caption": "El mapa de calor de correlación mide la correlación de nulidad: cómo la presencia o ausencia de una variable afecta la presencia de otra.", + "first_rows": "Primeras filas", + "last_rows": "Últimas filas", + "random_sample": "Muestra aleatoria" + }, + "structure": { + "correlations": "Correlaciones", + "heatmap": "Mapa de calor", + "table": "Tabla", + "overview": { + "values": "valores", + "number_variables": "Número de variables", + "number_observations": "Número de observaciones", + "missing_cells": "Celdas faltantes", + "missing_cells_percentage": "Celdas faltantes (%)", + "duplicate_rows": "Filas duplicadas", + "duplicate_rows_percentage": "Filas duplicadas (%)", + "total_size_memory": "Tamaño total en memoria", + "average_record_memory": "Tamaño promedio de registro en memoria", + "dataset_statistics": "Estadísticas del conjunto de datos", + "variable_types": "Tipos de variables", + "overview": "Resumen", + "url": "URL", + "copyright": 
"Derechos de autor", + "dataset": "Conjunto de datos", + "analysis_started": "Análisis iniciado", + "analysis_finished": "Análisis finalizado", + "duration": "Duración", + "software_version": "Versión del software", + "download_configuration": "Descargar configuración", + "reproduction": "Reproducción", + "variable_descriptions": "Descripciones de variables", + "variables": "Variables", + "alerts_count": "Alertas ({count})", + "number_of_series": "Número de series", + "timeseries_length": "Longitud de la serie temporal", + "starting_point": "Punto de inicio", + "ending_point": "Punto final", + "period": "Período", + "timeseries_statistics": "Estadísticas de series temporales", + "original": "Original", + "scaled": "Escalado", + "time_series": "Series temporales", + "interactions": "Interacciones", + "distinct": "Distinto", + "distinct_percentage": "Distinto (%)", + "missing": "Faltante", + "missing_percentage": "Faltante (%)", + "memory_size": "Tamaño en memoria", + "file": "Archivo", + "size": "Tamaño", + "file_size": "Tamaño del archivo", + "file_size_caption": "Histograma con intervalos fijos de tamaños de archivo (en bytes)", + "unique": "Único", + "unique_help": "El número de valores únicos (todos los valores que aparecen exactamente una vez en el conjunto de datos).", + "unique_percentage": "Único (%)", + "max_length": "Longitud máxima", + "median_length": "Longitud mediana", + "mean_length": "Longitud media", + "min_length": "Longitud mínima", + "length": "Longitud", + "length_histogram": "Histograma de longitud", + "histogram_lengths_category": "Histograma de longitudes de la categoría", + "most_occurring_categories": "Categorías más frecuentes", + "most_frequent_character_per_category": "Carácter más frecuente por categoría", + "most_occurring_scripts": "Scripts más frecuentes", + "most_frequent_character_per_script": "Carácter más frecuente por script", + "most_occurring_blocks": "Bloques más frecuentes", + "most_frequent_character_per_block": "Carácter 
más frecuente por bloque", + "total_characters": "Total de caracteres", + "distinct_characters": "Caracteres distintos", + "distinct_categories": "Categorías distintas", + "unicode_categories": "Categorías Unicode (haga clic para más información)", + "distinct_scripts": "Scripts distintos", + "unicode_scripts": "Scripts Unicode (haga clic para más información)", + "distinct_blocks": "Bloques distintos", + "unicode_blocks": "Bloques Unicode (haga clic para más información)", + "characters_unicode": "Caracteres y Unicode", + "characters_unicode_caption": "El estándar Unicode asigna propiedades a cada punto de código, que pueden usarse para analizar variables textuales.", + "most_occurring_characters": "Caracteres más frecuentes", + "characters": "Caracteres", + "categories": "Categorías", + "scripts": "Scripts", + "blocks": "Bloques", + "unicode": "Unicode", + "common_values": "Valores comunes", + "common_values_table": "Valores comunes (Tabla)", + "1st_row": "1ª fila", + "2nd_row": "2ª fila", + "3rd_row": "3ª fila", + "4th_row": "4ª fila", + "5th_row": "5ª fila", + "categories_passes_threshold": "Número de categorías de variables que superan el umbral (config.plot.cat_freq.max_unique)", + "common_values_plot": "Valores comunes (Gráfico)", + "common_words": "Palabras comunes", + "wordcloud": "Nube de palabras", + "words": "Palabras", + "mean": "Media", + "min": "Mínimo", + "max": "Máximo", + "zeros": "Ceros", + "zeros_percentage": "Ceros (%)", + "scatter": "Dispersión", + "scatterplot": "Gráfico de dispersión", + "scatterplot_caption": "Gráfico de dispersión en el plano complejo", + "mini_histogram": "Mini-histograma", + "histogram": "Histograma", + "histogram_caption": "Histograma con intervalos fijos", + "extreme_values": "Valores extremos", + "histogram_s": "Histograma(s)", + "invalid_dates": "Fechas inválidas", + "invalid_dates_percentage": "Fechas inválidas (%)", + "created": "Creado", + "accessed": "Accedido", + "modified": "Modificado", + "min_width": "Ancho 
mínimo", + "median_width": "Ancho mediano", + "max_width": "Ancho máximo", + "min_height": "Altura mínima", + "median_height": "Altura mediana", + "max_height": "Altura máxima", + "min_area": "Área mínima", + "median_area": "Área mediana", + "max_area": "Área máxima", + "scatter_plot_image_sizes": "Gráfico de dispersión de tamaños de imagen", + "scatter_plot": "Gráfico de dispersión", + "dimensions": "Dimensiones", + "exif_keys": "Claves EXIF", + "exif_data": "Datos EXIF", + "image": "Imagen", + "common_prefix": "Prefijo común", + "unique_stems": "Raíces únicas", + "unique_names": "Nombres únicos", + "unique_extensions": "Extensiones únicas", + "unique_directories": "Directorios únicos", + "unique_anchors": "Anclas únicas", + "full": "Completo", + "stem": "Raíz", + "name": "Nombre", + "extension": "Extensión", + "parent": "Padre", + "anchor": "Ancla", + "path": "Ruta", + "infinite": "Infinito", + "infinite_percentage": "Infinito (%)", + "Negative": "Negativo", + "Negative_percentage": "Negativo (%)", + "5_th_percentile": "Percentil 5", + "q1": "Q1", + "median": "Mediana", + "q3": "Q3", + "95_th_percentile": "Percentil 95", + "range": "Rango", + "iqr": "Rango intercuartílico (IQR)", + "quantile_statistics": "Estadísticas de cuantiles", + "standard_deviation": "Desviación estándar", + "cv": "Coeficiente de variación (CV)", + "kurtosis": "Curtosis", + "mad": "Desviación absoluta mediana (DAM)", + "skewness": "Asimetría", + "sum": "Suma", + "variance": "Varianza", + "monotonicity": "Monotonía", + "descriptive_statistics": "Estadísticas descriptivas", + "statistics": "Estadísticas", + "augmented_dickey_fuller_test_value": "Valor p del test de Dickey-Fuller aumentado", + "autocorrelation": "Autocorrelación", + "autocorrelation_caption": "ACF y PACF", + "timeseries": "Serie temporal", + "timeseries_plot": "Gráfico de serie temporal", + "scheme": "Esquema", + "netloc": "Ubicación de red", + "query": "Consulta", + "fragment": "Fragmento", + "heatmap": "Mapa de calor" + } + 
} + }, + "html": { + "alerts": { + "title": "Alertas", + "not_present": "No hay alertas presentes en este conjunto de datos", + "has_constant_value": "tiene un valor constante", + "has_constant_length": "tiene una longitud constante", + "has_dirty_categories": "tiene categorías sucias", + "has_high_cardinality": "tiene una alta cardinalidad", + "distinct_values": "valores distintos", + "dataset_has": "El conjunto de datos tiene", + "duplicate_rows": "filas duplicadas", + "dataset_is_empty": "El conjunto de datos está vacío", + "is_highly": "está altamente", + "correlated_with": "correlacionado con", + "and": "y", + "other_fields": "otros campos", + "highly_imbalanced": "está altamente desequilibrado", + "has": "tiene", + "infinite_values": "valores infinitos", + "missing_values": "valores faltantes", + "near_duplicate_rows": "filas casi duplicadas", + "non_stationary": "es no estacionario", + "seasonal": "es estacional", + "highly_skewed": "es altamente asimétrico", + "truncated_files": "archivos truncados", + "alert_type_date": "solo contiene valores de fecha y hora, pero es categórico. 
Considere aplicar", + "uniformly_distributed": "está uniformemente distribuido", + "unique_values": "tiene valores únicos", + "alert_unsupported": "es un tipo no soportado, verifique si necesita limpieza o análisis adicional", + "zeros": "ceros" + }, + "sequence": { + "overview_tabs": { + "brought_to_you_by": "Presentado por YData" + } + }, + "dropdown": "Seleccionar columnas", + "frequency_table": { + "value": "Valor", + "count": "Conteo", + "frequency_percentage": "Frecuencia (%)", + "redacted_value": "Valor redactado", + "no_values_found": "No se encontraron valores" + }, + "scores": { + "overall_data_quality": "Puntuación general de calidad de datos" + }, + "variable_info": { + "no_alerts": "Sin alertas" + } + } + } + + # 保存翻译文件 + french_file = "french_translation.json" + spanish_file = "spanish_translation.json" + + with open(french_file, 'w', encoding='utf-8') as f: + json.dump(french_translation, f, indent=2, ensure_ascii=False) + + with open(spanish_file, 'w', encoding='utf-8') as f: + json.dump(spanish_translation, f, indent=2, ensure_ascii=False) + + print(f"✅ French translation saved to: {french_file}") + print(f"✅ Spanish translation saved to: {spanish_file}") + + return french_file, spanish_file + + +def step3_single_file_loading(df, french_file): + """步骤3: 单个翻译文件加载示例""" + print(f"\n📁 Step 3: Loading single translation file - {french_file}") + + # 加载法语翻译 + load_translation_file(french_file, 'fr') + + print(f"📋 Available locales after loading: {get_available_locales()}") + + # 设置为法语并生成报告 + set_locale('fr') + print(f"🌍 Current locale set to: {get_locale()}") + + profile = ProfileReport(df, title="Rapport d'Analyse des Produits") + output_file = "product_analysis_french.html" + + # 强制覆盖生成报告 + try: + profile.to_file(output_file) + print(f"✅ French report generated: {output_file}") + except Exception as e: + print(f"⚠️ Warning generating French report: {e}") + # 如果报告生成失败,删除已存在的文件再重试 + if Path(output_file).exists(): + Path(output_file).unlink() + 
profile.to_file(output_file) + print(f"✅ French report generated (after cleanup): {output_file}") + + return output_file + + +def step4_directory_loading(df, french_file, spanish_file): + """步骤4: 翻译目录加载示例""" + print(f"\n📂 Step 4: Loading translation directory") + + # 创建翻译目录 + translations_dir = Path("custom_translations") + translations_dir.mkdir(exist_ok=True) + + # 移动翻译文件到目录 + french_target = translations_dir / "fr.json" + spanish_target = translations_dir / "es.json" + + # 复制文件而不是移动,避免文件已存在的错误 + try: + shutil.copy2(french_file, french_target) + print(f"📄 Copied {french_file} to {french_target}") + except Exception as e: + print(f"⚠️ Warning copying French file: {e}") + # 如果复制失败,直接覆盖 + shutil.copyfile(french_file, french_target) + + try: + shutil.copy2(spanish_file, spanish_target) + print(f"📄 Copied {spanish_file} to {spanish_target}") + except Exception as e: + print(f"⚠️ Warning copying Spanish file: {e}") + # 如果复制失败,直接覆盖 + shutil.copyfile(spanish_file, spanish_target) + + print(f"📁 Created translation directory: {translations_dir}") + print(f"📄 Files in directory: {list(translations_dir.glob('*.json'))}") + + # 加载整个翻译目录 + add_translation_directory(translations_dir) + + print(f"📋 Available locales after directory loading: {get_available_locales()}") + + # 生成西班牙语报告 + set_locale('es') + print(f"🌍 Current locale set to: {get_locale()}") + + profile = ProfileReport(df, title="Informe de Análisis de Productos") + output_file = "product_analysis_spanish.html" + + # 强制覆盖生成报告 + try: + profile.to_file(output_file) + print(f"✅ Spanish report generated: {output_file}") + except Exception as e: + print(f"⚠️ Warning generating Spanish report: {e}") + # 如果报告生成失败,删除已存在的文件再重试 + if Path(output_file).exists(): + Path(output_file).unlink() + profile.to_file(output_file) + print(f"✅ Spanish report generated (after cleanup): {output_file}") + + return output_file, translations_dir + + +def step5_using_locale_parameter(df): + """步骤5: 使用ProfileReport的locale参数""" + print(f"\n⚙️ Step 
5: Using ProfileReport locale parameter") + + # 直接在ProfileReport中指定语言 + print("🔄 Generating report with locale='zh' parameter...") + profile_zh = ProfileReport(df, title="产品分析报告", locale='zh') + output_file = "product_analysis_chinese.html" + + # 强制覆盖生成报告 + try: + profile_zh.to_file(output_file) + print(f"✅ Chinese report generated: {output_file}") + except Exception as e: + print(f"⚠️ Warning generating Chinese report: {e}") + # 如果报告生成失败,删除已存在的文件再重试 + if Path(output_file).exists(): + Path(output_file).unlink() + profile_zh.to_file(output_file) + print(f"✅ Chinese report generated (after cleanup): {output_file}") + + print(f"🌍 Current global locale remains: {get_locale()}") + + return output_file + + +def cleanup_files(files_to_clean): + """清理生成的文件""" + print(f"\n🧹 Cleaning up generated files...") + + for file_path in files_to_clean: + try: + if isinstance(file_path, str): + file_path = Path(file_path) + + if file_path.exists(): + if file_path.is_file(): + file_path.unlink() + elif file_path.is_dir(): + shutil.rmtree(file_path) + print(f"🗑️ Removed: {file_path}") + except Exception as e: + print(f"⚠️ Could not remove {file_path}: {e}") + + +def safe_file_operation(operation_func, *args, **kwargs): + """安全执行文件操作,包含重试逻辑""" + max_retries = 3 + for attempt in range(max_retries): + try: + return operation_func(*args, **kwargs) + except Exception as e: + if attempt < max_retries - 1: + print(f"⚠️ Attempt {attempt + 1} failed: {e}. 
Retrying...") + import time + time.sleep(0.5) # 短暂等待 + else: + print(f"❌ All attempts failed: {e}") + raise + + +def main(): + """主函数 - 演示完整的翻译工作流程""" + print("🚀 YData Profiling Custom Translation Workflow Example") + print("=" * 60) + + # 记录要清理的文件 + files_to_clean = [] + + try: + # 创建示例数据 + df = create_sample_data() + + # 步骤1: 导出模板 + template_file = step1_export_template() + files_to_clean.append(template_file) + + # 步骤2: 创建自定义翻译 + french_file, spanish_file = step2_create_custom_translations(template_file) + files_to_clean.extend([french_file, spanish_file]) + + # 步骤3: 单文件加载 + french_report = safe_file_operation(step3_single_file_loading, df, french_file) + files_to_clean.append(french_report) + + # 步骤4: 目录加载 + spanish_report, translations_dir = safe_file_operation(step4_directory_loading, df, french_file, spanish_file) + files_to_clean.extend([spanish_report, translations_dir]) + + # 步骤5: 使用locale参数 + chinese_report = safe_file_operation(step5_using_locale_parameter, df) + files_to_clean.append(chinese_report) + + print(f"\n🎉 All steps completed successfully!") + print(f"📊 Generated reports:") + print(f" - {french_report} (French)") + print(f" - {spanish_report} (Spanish)") + print(f" - {chinese_report} (Chinese)") + print(f"\n💡 You can open these HTML files in your browser to see the translated reports.") + + except Exception as e: + print(f"❌ Error occurred: {e}") + import traceback + traceback.print_exc() + + finally: + # 询问是否清理文件 + try: + response = input(f"\n🤔 Do you want to clean up generated files? 
(y/N): ").strip().lower() + if response in ['y', 'yes']: + cleanup_files(files_to_clean) + else: + print("📁 Files kept for your review.") + print("💡 Tip: You can run this script multiple times to see the overwrites working.") + except KeyboardInterrupt: + print(f"\n📁 Files kept for your review.") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/i18n/i18n_example.py b/examples/i18n/i18n_example.py new file mode 100644 index 000000000..faca0a566 --- /dev/null +++ b/examples/i18n/i18n_example.py @@ -0,0 +1,42 @@ +""" +Example of using ydata-profiling with internationalization +""" +import pandas as pd +from ydata_profiling import ProfileReport +from ydata_profiling.i18n import set_locale +from ydata_profiling.utils.locale_utils import auto_set_locale + +# Create sample data +df = pd.DataFrame({ + 'numeric_column': [1, 2, 3, 4, 5], + 'categorical_column': ['A', 'B', 'A', 'C', 'B'], + 'text_column': ['Hello', 'World', 'Test', 'Data', 'Science'] +}) + +# Use the default report generation method +print("Default report generation report...") +profile_default = ProfileReport(df, title="Default Data Profiling Report") +profile_default.to_file("default_report.html") + +# Auto-detect and set language +print("Auto-detect generation report...") +auto_set_locale() +profile_zh = ProfileReport(df, title="Auto Detect Data Profiling Report") +profile_zh.to_file("auto_report_chinese.html") + +# Generate a report in English +print("Generating English report...") +set_locale('en') +profile_en = ProfileReport(df, title="English Data Profiling Report") +profile_en.to_file("report_english.html") + +# Generate a report in Chinese +print("Generating Chinese report...") +set_locale('zh') +profile_zh = ProfileReport(df, title="中文数据分析报告") +profile_zh.to_file("report_chinese.html") + +# Specify the language during initialization +print("Generating report with locale parameter...") +profile_locale = ProfileReport(df, title="报告标题", locale='zh') 
+profile_locale.to_file("report_with_locale.html") \ No newline at end of file diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 09dbecdde..d02b677df 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -5,6 +5,7 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr +from ydata_profiling.i18n import set_locale, get_locale def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: @@ -290,6 +291,14 @@ class Report(BaseModel): precision: int = 8 +# Add a new configuration item in the Settings class +class I18n(BaseModel): + """Internationalization settings""" + locale: str = "en" + auto_detect: bool = True + fallback_locale: str = "en" + + class Settings(BaseSettings): # Default prefix to avoid collisions with environment variables class Config: @@ -355,6 +364,24 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() + # Add internationalization configuration + i18n: I18n = I18n() + + def __init__(self, **data): + # Check the current language setting before initialization + current_locale = get_locale() + + super().__init__(**data) + + # If no locale is explicitly specified and there is currently a non-default language setting, + # maintain the current setting + if 'i18n' not in data and current_locale != 'en': + self.i18n.locale = current_locale + + # Set locale + if self.i18n.locale: + set_locale(self.i18n.locale) + def update(self, updates: dict) -> "Settings": update = _merge_dictionaries(self.dict(), updates) return self.parse_obj(self.copy(update=update)) @@ -368,10 +395,21 @@ def from_file(config_file: Union[Path, str]) -> "Settings": Returns: Settings """ + # Save current language settings + current_locale = get_locale() + with open(config_file) as f: data = yaml.safe_load(f) - return Settings.parse_obj(data) + settings = Settings.parse_obj(data) + + # If no language is specified in the configuration file and there is currently a non-default language 
setting, + # maintain the current setting + if 'i18n' not in data and current_locale != 'en': + settings.i18n.locale = current_locale + set_locale(current_locale) + + return settings class SparkSettings(Settings): diff --git a/src/ydata_profiling/config_default.yaml b/src/ydata_profiling/config_default.yaml index a72b8bdc2..cddd5d1f2 100644 --- a/src/ydata_profiling/config_default.yaml +++ b/src/ydata_profiling/config_default.yaml @@ -221,3 +221,8 @@ html: - "#198754" full_width: false + +i18n: + locale: "en" + auto_detect: true + fallback_locale: "en" \ No newline at end of file diff --git a/src/ydata_profiling/config_minimal.yaml b/src/ydata_profiling/config_minimal.yaml index a3cb46211..dd364e7be 100644 --- a/src/ydata_profiling/config_minimal.yaml +++ b/src/ydata_profiling/config_minimal.yaml @@ -220,3 +220,8 @@ html: - "#198754" full_width: false + +i18n: + locale: "en" + auto_detect: true + fallback_locale: "en" \ No newline at end of file diff --git a/src/ydata_profiling/i18n/__init__.py b/src/ydata_profiling/i18n/__init__.py new file mode 100644 index 000000000..eb414bbbd --- /dev/null +++ b/src/ydata_profiling/i18n/__init__.py @@ -0,0 +1,249 @@ +""" +Internationalization module for ydata-profiling +""" +import os +import json +from pathlib import Path +from typing import Dict, Optional, List, Union +import threading + +class TranslationManager: + """Manages translations for ydata-profiling with support for external translation files""" + + _instance = None + _lock = threading.Lock() + + def __new__(cls): + if cls._instance is None: + with cls._lock: + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self): + if not hasattr(self, 'initialized'): + self.translations: Dict[str, Dict[str, str]] = {} + self.current_locale = 'en' + self.fallback_locale = 'en' + self.external_translation_dirs: List[Path] = [] + self.initialized = True + self._load_translations() + + def add_translation_directory(self, 
directory: Union[str, Path]): + """Add external translation directory + + Args: + directory: Path to directory containing translation JSON files + """ + dir_path = Path(directory) + if dir_path.exists() and dir_path.is_dir(): + if dir_path not in self.external_translation_dirs: + self.external_translation_dirs.append(dir_path) + self._load_external_translations(dir_path) + else: + print(f"Warning: Translation directory {directory} does not exist") + + def load_translation_file(self, file_path: Union[str, Path], locale: Optional[str] = None): + """Load a specific translation file + + Args: + file_path: Path to the translation JSON file + locale: Locale code. If None, will be inferred from filename + """ + file_path = Path(file_path) + if not file_path.exists(): + print(f"Warning: Translation file {file_path} does not exist") + return + + if locale is None: + locale = file_path.stem + + try: + with open(file_path, 'r', encoding='utf-8') as f: + external_translations = json.load(f) + + # Merge with existing translations + if locale in self.translations: + self.translations[locale] = self._merge_translations( + self.translations[locale], + external_translations + ) + else: + self.translations[locale] = external_translations + + print(f"Successfully loaded translation file for locale '{locale}' from {file_path}") + except Exception as e: + print(f"Warning: Failed to load translation file {file_path}: {e}") + + def _merge_translations(self, base: dict, override: dict) -> dict: + """Recursively merge translation dictionaries""" + result = base.copy() + for key, value in override.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = self._merge_translations(result[key], value) + else: + result[key] = value + return result + + def _load_translations(self): + """Load built-in translation files""" + translations_dir = Path(__file__).parent / 'locales' + if translations_dir.exists(): + 
self._load_translations_from_directory(translations_dir) + + def _load_external_translations(self, directory: Path): + """Load translations from external directory""" + self._load_translations_from_directory(directory) + + def _load_translations_from_directory(self, directory: Path): + """Load all translation files from a directory""" + for locale_file in directory.glob('*.json'): + locale = locale_file.stem + try: + with open(locale_file, 'r', encoding='utf-8') as f: + translations = json.load(f) + + if locale in self.translations: + # Merge with existing translations + self.translations[locale] = self._merge_translations( + self.translations[locale], + translations + ) + else: + self.translations[locale] = translations + + except Exception as e: + print(f"Warning: Failed to load translation file {locale_file}: {e}") + + def get_available_locales(self) -> List[str]: + """Get list of available locales""" + return list(self.translations.keys()) + + def set_locale(self, locale: str): + """Set the current locale""" + if locale in self.translations or locale == self.fallback_locale: + self.current_locale = locale + else: + print(f"Warning: Locale '{locale}' not found, using fallback '{self.fallback_locale}'") + print(f"Available locales: {self.get_available_locales()}") + + def get_translation(self, key: str, locale: Optional[str] = None, **kwargs) -> str: + """Get translation for a key""" + target_locale = locale or self.current_locale + + # Try current locale + if target_locale in self.translations: + translation = self._get_nested_value(self.translations[target_locale], key) + if translation: + return self._format_translation(translation, **kwargs) + + # Try fallback locale + if target_locale != self.fallback_locale and self.fallback_locale in self.translations: + translation = self._get_nested_value(self.translations[self.fallback_locale], key) + if translation: + return self._format_translation(translation, **kwargs) + + # Return key if no translation found + 
return key + + def _get_nested_value(self, data: dict, key: str) -> Optional[str]: + """Get nested value from dictionary using dot notation""" + keys = key.split('.') + current = data + for k in keys: + if isinstance(current, dict) and k in current: + current = current[k] + else: + return None + return current if isinstance(current, str) else None + + def _format_translation(self, translation: str, **kwargs) -> str: + """Format translation with parameters""" + try: + return translation.format(**kwargs) + except (KeyError, ValueError): + return translation + + def export_template(self, locale: str, output_file: Union[str, Path]): + """Export translation template for a specific locale + + Args: + locale: Source locale to export (usually 'en') + output_file: Output file path + """ + if locale not in self.translations: + print(f"Warning: Locale '{locale}' not found") + return + + output_path = Path(output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(self.translations[locale], f, indent=2, ensure_ascii=False) + + print(f"Translation template exported to {output_path}") + +# Global translation manager instance +_translation_manager = TranslationManager() + +def set_locale(locale: str): + """Set the global locale""" + _translation_manager.set_locale(locale) + +def get_locale() -> str: + """Get the current locale""" + return _translation_manager.current_locale + +def add_translation_directory(directory: Union[str, Path]): + """Add external translation directory""" + _translation_manager.add_translation_directory(directory) + +def load_translation_file(file_path: Union[str, Path], locale: Optional[str] = None): + """Load a specific translation file""" + _translation_manager.load_translation_file(file_path, locale) + +def get_available_locales() -> List[str]: + """Get list of available locales""" + return _translation_manager.get_available_locales() + +def export_translation_template(locale: 
str = 'en', output_file: Union[str, Path] = 'translation_template.json'): + """Export translation template for customization""" + _translation_manager.export_template(locale, output_file) + +def _(key: str, default: Optional[str] = None, **kwargs) -> str: + """Translation function with optional default fallback + + Args: + key: Translation key in dot notation (e.g., 'report.title') + default: Default value to return if translation is not found + **kwargs: Parameters for string formatting + + Returns: + Translated string, default value, or the key itself if no translation found + """ + translation = _translation_manager.get_translation(key, **kwargs) + + # If the translation key is not found and a default value is provided, use the default value + if translation == key and default is not None: + return default + + return translation + +def t(key: str, **kwargs) -> str: + """Translation function - alias for _() + + Args: + key: Translation key in dot notation + **kwargs: Parameters for string formatting + + Returns: + Translated string + """ + return _(key, **kwargs) + +# Export main functions +__all__ = [ + 'set_locale', 'get_locale', '_', 't', 'TranslationManager', + 'add_translation_directory', 'load_translation_file', + 'get_available_locales', 'export_translation_template' +] \ No newline at end of file diff --git a/src/ydata_profiling/i18n/locales/en.json b/src/ydata_profiling/i18n/locales/en.json new file mode 100644 index 000000000..f0698a5da --- /dev/null +++ b/src/ydata_profiling/i18n/locales/en.json @@ -0,0 +1,364 @@ +{ + "report": { + "title": "YData Profiling Report", + "overview": "Overview", + "variables": "Variables", + "interactions": "Interactions", + "correlations": "Correlations", + "missing_values": "Missing values", + "sample": "Sample", + "duplicates": "Duplicate rows", + "footer_text": "Report generated by YData.", + "most_frequently_occurring": "Most frequently occurring", + "columns": "Columns", + "more_details": "More details" + }, + 
"overview": { + "dataset_info": "Dataset info", + "variable_types": "Variable types", + "dataset_statistics": "Dataset statistics", + "number_of_variables": "Number of variables", + "number_of_observations": "Number of observations", + "missing_cells": "Missing cells", + "missing_cells_percentage": "Missing cells (%)", + "duplicate_rows": "Duplicate rows", + "duplicate_rows_percentage": "Duplicate rows (%)", + "average_record_size": "Average record size in memory" + }, + "variables": { + "numeric": "Numeric", + "categorical": "Categorical", + "boolean": "Boolean", + "date": "Date", + "text": "Text", + "url": "URL", + "path": "Path", + "image": "Image", + "distinct": "Distinct", + "distinct_percentage": "Distinct (%)", + "missing": "Missing", + "missing_percentage": "Missing (%)", + "statistics": "Statistics", + "quantile_statistics": "Quantile statistics", + "common_values": "Common values", + "histogram": "Histogram", + "mode": "Mode", + "standard_deviation": "Standard deviation", + "sum": "Sum", + "mad": "Median Absolute Deviation (MAD)", + "coefficient_of_variation": "Coefficient of variation (CV)", + "kurtosis": "Kurtosis", + "skewness": "Skewness", + "range": "Range", + "interquartile_range": "Interquartile range (IQR)", + "length": "Length", + "sample": "Sample" + }, + "correlations": { + "pearson": "Pearson's r", + "spearman": "Spearman's ρ", + "kendall": "Kendall's τ", + "phi_k": "Phik (φk)", + "cramers": "Cramér's V (φc)", + "auto": "Auto" + }, + "interactions": { + "scatter_plot": "Scatter plot", + "variable": "Variable" + }, + "missing_values": { + "matrix": "Matrix", + "bar_chart": "Bar chart", + "heatmap": "Heatmap", + "dendrogram": "Dendrogram" + }, + "alerts": { + "high_correlation": "High correlation", + "high_cardinality": "High cardinality", + "constant": "Constant", + "zeros": "Zeros", + "missing": "Missing", + "skewed": "Skewed", + "type_date": "Date", + "uniform": "Uniform", + "unique": "Unique", + "constant_length": "Constant length", + 
"duplicates": "Duplicates", + "empty": "Empty", + "imbalance": "Imbalance", + "near_duplicates": "Near duplicates", + "non_stationary": "Non stationary", + "seasonal": "Seasonal", + "truncated": "Truncated", + "unsupported": "Unsupported", + "dirty_category": "Dirty category" + }, + "formatting": { + "bytes": "{value} bytes", + "kb": "{value} KB", + "mb": "{value} MB", + "gb": "{value} GB", + "percentage": "{value}%" + }, + "rendering": { + "generate_structure": "Generate report structure", + "html_progress": "Render HTML", + "json_progress": "Render JSON", + "widgets_progress": "Render widgets", + "other_values_count": "Other values ({other_count})", + "missing": "(Missing)" + }, + "core": { + "unknown": "unknown", + "alerts": "Alerts", + "collapse": "Collapse", + "container": "Container", + "correlationTable": "CorrelationTable", + "dropdown": "Dropdown", + "duplicate": "Duplicate", + "frequencyTable": "FrequencyTable", + "frequencyTableSmall": "FrequencyTableSmall", + "html": "HTML", + "image": "Image", + "sample": "Sample", + "scores": "Scores", + "table": "Table", + "toggle_button": "ToggleButton", + "variable": "Variable", + "variable_info": "VariableInfo", + "model": { + "bar_count": "Count", + "bar_caption": "A simple visualization of nullity by column.", + "matrix": "Matrix", + "matrix_caption":"Nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.", + "heatmap": "Heatmap", + "heatmap_caption": "The correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another.", + "first_rows": "First rows", + "last_rows": "Last rows", + "random_sample": "Random sample" + }, + "structure": { + "correlations": "Correlations", + "heatmap": "Heatmap", + "table": "Table", + "overview": { + "values": "values", + "number_variables": "Number of variables", + "number_observations": "Number of observations", + "missing_cells": "Missing cells", + 
"missing_cells_percentage": "Missing cells (%)", + "duplicate_rows": "Duplicate rows", + "duplicate_rows_percentage": "Duplicate rows (%)", + "total_size_memory": "Total size in memory", + "average_record_memory": "Average record size in memory", + "dataset_statistics": "Dataset statistics", + "variable_types": "Variable types", + "overview": "Overview", + "url": "URL", + "copyright": "Copyright", + "dataset": "Dataset", + "analysis_started": "Analysis started", + "analysis_finished": "Analysis finished", + "duration": "Duration", + "software_version": "Software version", + "download_configuration": "Download configuration", + "reproduction": "Reproduction", + "variable_descriptions": "Variable descriptions", + "variables": "Variables", + "alerts_count": "Alerts ({count})", + "number_of_series": "Number of series", + "timeseries_length": "Time series length", + "starting_point": "Starting point", + "ending_point": "Ending point", + "period": "Period", + "timeseries_statistics": "Timeseries statistics", + "original": "Original", + "scaled": "Scaled", + "time_series": "Time Series", + "interactions": "Interactions", + "distinct": "Distinct", + "distinct_percentage": "Distinct (%)", + "missing": "Missing", + "missing_percentage": "Missing (%)", + "memory_size": "Memory size", + "file": "File", + "size": "Size", + "file_size": "File size", + "file_size_caption":"Histogram with fixed size bins of file sizes (in bytes)", + "unique": "Unique", + "unique_help": "The number of unique values (all values that occur exactly once in the dataset).", + "unique_percentage": "Unique (%)", + "max_length": "Max length", + "median_length": "Median length", + "mean_length": "Mean length", + "min_length": "Min length", + "length": "Length", + "length_histogram": "length histogram", + "histogram_lengths_category": "Histogram of lengths of the category", + "most_occurring_categories": "Most occurring categories", + "most_frequent_character_per_category": "Most frequent character per 
category", + "most_occurring_scripts": "Most occurring scripts", + "most_frequent_character_per_script": "Most frequent character per script", + "most_occurring_blocks": "Most occurring blocks", + "most_frequent_character_per_block": "Most frequent character per block", + "total_characters": "Total characters", + "distinct_characters": "Distinct characters", + "distinct_categories": "Distinct categories", + "unicode_categories": "Unicode categories (click for more information)", + "distinct_scripts": "Distinct scripts", + "unicode_scripts": "Unicode scripts (click for more information)", + "distinct_blocks": "Distinct blocks", + "unicode_blocks": "Unicode blocks (click for more information)", + "characters_unicode": "Characters and Unicode", + "characters_unicode_caption": "The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.", + "most_occurring_characters": "Most occurring characters", + "characters": "Characters", + "categories": "Categories", + "scripts": "Scripts", + "blocks": "Blocks", + "unicode": "Unicode", + "common_values": "Common Values", + "common_values_table": "Common Values (Table)", + "1st_row": "1st row", + "2nd_row": "2nd row", + "3rd_row":"3rd row", + "4th_row": "4th row", + "5th_row": "5th row", + "categories_passes_threshold ": "Number of variable categories passes threshold (config.plot.cat_freq.max_unique)", + "common_values_plot": "Common Values (Plot)", + "common_words": "Common words", + "wordcloud": "Wordcloud", + "words": "Words", + "mean": "Mean", + "min": "Minimum", + "max": "Maximum", + "zeros": "Zeros", + "zeros_percentage": "Zeros (%)", + "scatter": "Scatter", + "scatterplot": "Scatterplot", + "scatterplot_caption": "Scatterplot in the complex plane", + "mini_histogram": "Mini histogram", + "histogram": "Histogram", + "histogram_caption": "Histogram with fixed size bins", + "extreme_values": "Extreme values", + "histogram_s": "Histogram(s)", + "invalid_dates": "Invalid 
dates", + "invalid_dates_percentage": "Invalid dates (%)", + "created": "Created", + "accessed": "Accessed", + "modified": "Modified", + "min_width": "Min width", + "median_width": "Median width", + "max_width": "Max width", + "min_height": "Min height", + "median_height": "Median height", + "max_height": "Max height", + "min_area": "Min area", + "median_area": "Median area", + "max_area": "Max area", + "scatter_plot_image_sizes": "Scatter plot of image sizes", + "scatter_plot":"Scatter plot", + "dimensions": "Dimensions", + "exif_keys": "Exif keys", + "exif_data": "Exif data", + "image": "Image", + "common_prefix": "Common prefix", + "unique_stems": "Unique stems", + "unique_names": "Unique names", + "unique_extensions": "Unique extensions", + "unique_directories": "Unique directories", + "unique_anchors": "Unique anchors", + "full": "Full", + "stem": "Stem", + "name": "Name", + "extension": "Extension", + "parent": "Parent", + "anchor": "Anchor", + "path": "Path", + "infinite": "Infinite", + "infinite_percentage": "Infinite (%)", + "Negative": "Negative", + "Negative_percentage": "Negative (%)", + "5_th_percentile": "5-th percentile", + "q1": "Q1", + "median": "median", + "q3": "Q3", + "95_th_percentile": "95-th percentile", + "range": "Range", + "iqr": "Interquartile range (IQR)", + "quantile_statistics": "Quantile statistics", + "standard_deviation": "Standard deviation", + "cv": "Coefficient of variation (CV)", + "kurtosis": "Kurtosis", + "mad": "Median Absolute Deviation (MAD)", + "skewness": "Skewness", + "sum": "Sum", + "variance": "Variance", + "monotonicity": "Monotonicity", + "descriptive_statistics": "Descriptive statistics", + "statistics": "Statistics", + "augmented_dickey_fuller_test_value": "Augmented Dickey-Fuller test p-value", + "autocorrelation": "Autocorrelation", + "autocorrelation_caption": "ACF and PACF", + "timeseries": "Time-series", + "timeseries_plot": "Time-series plot", + "scheme": "Scheme", + "netloc": "Netloc", + "query": "Query", + 
"fragment": "Fragment", + "heatmap": "Heatmap" + } + } + }, + "html": { + "alerts": { + "title": "Alerts", + "not_present": "Alert not present in this dataset", + "has_constant_value": "has constant value", + "has_constant_length": "has constant length", + "has_dirty_categories": "has dirty categories", + "has_high_cardinality": "has a high cardinality", + "distinct_values": "distinct values", + "dataset_has": "Dataset has", + "duplicate_rows": "duplicate rows", + "dataset_is_empty": "Dataset is empty", + "is_highly": "is highly", + "correlated_with": "correlated with", + "and": "and", + "other_fields": "other fields", + "highly_imbalanced": "is highly imbalanced", + "has": "has", + "infinite_values": "infinite values", + "missing_values": "missing values", + "near_duplicate_rows": "near duplicate rows", + "non_stationary": "is non stationary", + "seasonal": "is seasonal", + "highly_skewed": "is highly skewed", + "truncated_files": "truncated files", + "alert_type_date": "only contains datetime values, but is categorical. 
Consider applying", + "uniformly_distributed": "is uniformly distributed", + "unique_values": "has unique values", + "alert_unsupported": "is an unsupported type, check if it needs cleaning or further analysis", + "zeros": "zeros" + }, + "sequence": { + "overview_tabs": { + "brought_to_you_by": "Brought to you by YData" + } + }, + "dropdown": "Select Columns", + "frequency_table":{ + "value": "Value", + "count": "Count", + "frequency_percentage": "Frequency (%)", + "redacted_value": "Redacted value", + "no_values_found": "No values found" + }, + "scores": { + "overall_data_quality": "Overall Data Quality Score" + }, + "variable_info": { + "no_alerts": "No alerts" + } + } +} \ No newline at end of file diff --git a/src/ydata_profiling/i18n/locales/zh.json b/src/ydata_profiling/i18n/locales/zh.json new file mode 100644 index 000000000..14657255d --- /dev/null +++ b/src/ydata_profiling/i18n/locales/zh.json @@ -0,0 +1,363 @@ +{ + "report": { + "title": "YData 数据分析报告", + "overview": "概览", + "variables": "变量", + "interactions": "交互分析", + "correlations": "相关性", + "missing_values": "缺失值", + "sample": "样本数据", + "duplicates": "重复数据", + "footer_text": "报告由 YData 生成。", + "most_frequently_occurring": "出现频率最高的", + "columns": "列", + "more_details": "更多详情" + }, + "overview": { + "dataset_info": "数据集信息", + "variable_types": "变量类型", + "dataset_statistics": "数据集统计", + "number_of_variables": "变量数量", + "number_of_observations": "观测数量", + "missing_cells": "缺失单元格", + "missing_cells_percentage": "缺失单元格百分比", + "duplicate_rows": "重复行", + "duplicate_rows_percentage": "重复行百分比", + "average_record_size": "平均记录内存大小" + }, + "variables": { + "numeric": "数值型", + "categorical": "分类型", + "boolean": "布尔型", + "date": "日期型", + "text": "文本型", + "url": "网址型", + "path": "路径型", + "image": "图像型", + "distinct": "唯一值", + "distinct_percentage": "唯一值百分比", + "missing": "缺失值", + "missing_percentage": "缺失值百分比", + "statistics": "统计信息", + "quantile_statistics": "分位数统计", + "common_values": "常见值", + "histogram": "直方图", 
+ "mode": "众数", + "standard_deviation": "标准差", + "sum": "总和", + "mad": "中位绝对偏差 (MAD)", + "coefficient_of_variation": "变异系数 (CV)", + "kurtosis": "峰度", + "skewness": "偏度", + "range": "范围", + "interquartile_range": "四分位距 (IQR)", + "length": "长度", + "sample": "样本" + }, + "correlations": { + "pearson": "皮尔逊相关系数", + "spearman": "斯皮尔曼相关系数", + "kendall": "肯德尔相关系数", + "phi_k": "Phik相关系数 (φk)", + "cramers": "克拉默V系数 (φc)", + "auto": "自动" + }, + "interactions": { + "scatter_plot": "散点图", + "variable": "变量" + }, + "missing_values": { + "matrix": "矩阵图", + "bar_chart": "条形图", + "heatmap": "热力图", + "dendrogram": "树状图" + }, + "alerts": { + "high_correlation": "高相关性", + "high_cardinality": "高基数", + "constant": "常量", + "zeros": "零值", + "missing": "缺失", + "skewed": "偏斜", + "infinite": "无穷值", + "type_date": "日期类型", + "uniform": "均匀分布", + "unique": "唯一值", + "duplicates": "重复值", + "empty": "空值", + "imbalance": "不平衡", + "near_duplicates": "近似重复", + "non_stationary": "非平稳", + "seasonal": "季节性", + "truncated": "截断", + "unsupported": "不支持", + "dirty_category": "脏数据分类" + }, + "formatting": { + "bytes": "{value} 字节", + "kb": "{value} KB", + "mb": "{value} MB", + "gb": "{value} GB", + "percentage": "{value}%" + }, + "rendering": { + "generate_structure": "生成报告结构", + "html_progress": "渲染HTML报告", + "json_progress": "渲染JSON报告", + "widgets_progress": "渲染组件", + "other_values_count": "其他值 ({other_count})", + "missing": "(缺失)" + }, + "core": { + "unknown": "未知", + "alerts": "警告", + "collapse": "收起", + "container": "容器", + "correlationTable": "相关表", + "dropdown": "下拉选项", + "duplicate": "重复", + "frequencyTable": "频率表", + "frequencyTableSmall": "小频率表", + "html": "HTML", + "image": "Image", + "sample": "样本", + "scores": "评分", + "table": "表", + "toggle_button": "切换按钮", + "variable": "变量", + "variable_info": "变量信息", + "model": { + "bar_count": "总计", + "bar_caption": "按列对空值进行简单可视化。", + "matrix": "矩阵", + "matrix_caption":"缺失值矩阵是一种数据密集型展示,可让您快速直观地发现数据完整性中的模式。", + "heatmap": "热力图", + "heatmap_caption": 
"相关热图用于衡量零值相关性:一个变量的存在或缺失对另一个变量的影响程度。", + "first_rows": "前几行", + "last_rows": "最后几行", + "random_sample": "随机抽样" + }, + "structure": { + "correlations": "相关性", + "heatmap": "热力图", + "table": "表", + "overview": { + "values": "值", + "number_variables": "变量数量", + "number_observations": "观测数量", + "missing_cells": "缺失单元格", + "missing_cells_percentage": "缺失单元格占比(%)", + "duplicate_rows": "重复行", + "duplicate_rows_percentage": "重复行占比 (%)", + "total_size_memory": "内存中总大小", + "average_record_memory": "内存中的平均记录大小", + "dataset_statistics": "数据集统计", + "variable_types": "变量类型", + "overview": "概览", + "url": "网址", + "copyright": "版权", + "dataset": "数据集", + "analysis_started": "分析开始", + "analysis_finished": "分析结束", + "duration": "持续时间", + "software_version": "软件版本", + "download_configuration": "下载配置", + "reproduction": "复现", + "variable_descriptions": "变量描述", + "variables": "变量", + "alerts_count": "警告 ({count})", + "number_of_series": "序列数量", + "timeseries_length": "时间序列长度", + "starting_point": "起始点", + "ending_point": "结束点", + "period": "周期", + "timeseries_statistics": "时序统计", + "original": "原始", + "scaled": "缩放", + "time_series": "时间序列", + "interactions": "交互", + "distinct": "独特值", + "distinct_percentage": "独特值占比 (%)", + "missing": "缺失值", + "missing_percentage": "缺失值占比 (%)", + "memory_size": "内存大小", + "file": "文件", + "size": "大小", + "file_size": "文件大小", + "file_size_caption":"具有固定大小文件大小(以字节为单位)的直方图", + "unique": "唯一值", + "unique_help": "唯一值(即数据集中只出现一次的所有值)的数量。", + "unique_percentage": "唯一值占比 (%)", + "max_length": "最大长度", + "median_length": "中位长度", + "mean_length": "平均长度", + "min_length": "最小长度", + "length": "长度", + "length_histogram": "长度直方图", + "histogram_lengths_category": "该类别的长度直方图", + "most_occurring_categories": "最常见类别", + "most_frequent_character_per_category": "每个类别中出现频率最高的字符", + "most_occurring_scripts": "最常见的脚本", + "most_frequent_character_per_script": "每种脚本中出现频率最高的字符", + "most_occurring_blocks": "出现最多的块", + "most_frequent_character_per_block": "每个区块中出现频率最高的字符", + 
"total_characters": "总字符数", + "distinct_characters": "唯一字符", + "distinct_categories": "唯一类别", + "unicode_categories": "Unicode 类别(点击查看更多信息)", + "distinct_scripts": "唯一脚本", + "unicode_scripts": "Unicode 脚本(点击查看更多信息)", + "distinct_blocks": "唯一块", + "unicode_blocks": "Unicode 块(点击查看更多信息)", + "characters_unicode": "字符与Unicode", + "characters_unicode_caption": "Unicode标准为每个码位分配了字符属性,这些属性可用于分析文本变量。", + "most_occurring_characters": "出现频率最高的字符", + "characters": "字符", + "categories": "分类", + "scripts": "脚本", + "blocks": "块", + "unicode": "Unicode", + "common_values": "常见值", + "common_values_table": "常见值 (表)", + "1st_row": "第1行", + "2nd_row": "第2行", + "3rd_row": "第3行", + "4th_row": "第4行", + "5th_row": "第5行", + "categories_passes_threshold ": "通过阈值的变量类别数量 (config.plot.cat_freq.max_unique)", + "common_values_plot": "常见值 (图)", + "common_words": "通用词汇", + "wordcloud": "词云", + "words": "词汇", + "mean": "均值", + "min": "最小值", + "max": "最大值", + "zeros": "零值", + "zeros_percentage": "零值占比 (%)", + "scatter": "散点图", + "scatterplot": "散点图", + "scatterplot_caption": "复平面散点图", + "mini_histogram": "迷你直方图", + "histogram": "直方图", + "histogram_caption": "等宽直方图", + "extreme_values": "隐藏值", + "histogram_s": "直方图", + "invalid_dates": "无效日期", + "invalid_dates_percentage": "无效日期占比 (%)", + "created": "创建", + "accessed": "访问", + "modified": "修改", + "min_width": "最小宽度", + "median_width": "中位宽度", + "max_width": "最大宽度", + "min_height": "最小高度", + "median_height": "中位高度", + "max_height": "最大高度", + "min_area": "最小区间", + "median_area": "中位区间", + "max_area": "最大区间", + "scatter_plot_image_sizes": "图像大小的散点图", + "scatter_plot":"散点图", + "dimensions": "维度", + "exif_keys": "Exif 键", + "exif_data": "Exif 数据", + "image": "图片", + "common_prefix": "通用前缀", + "unique_stems": "唯一值", + "unique_names": "具有唯一值", + "unique_extensions": "唯一拓展名", + "unique_directories": "唯一目录", + "unique_anchors": "唯一值", + "full": "完整", + "stem": "词干", + "name": "名称", + "extension": "扩展名", + "parent": "父级", + "anchor": "锚点", + "path": "路径", + 
"infinite": "无限值", + "infinite_percentage": "无限值占比 (%)", + "Negative": "负值", + "Negative_percentage": "负值占比 (%)", + "5_th_percentile": "5% 分位数", + "q1": "Q1", + "median": "中位数", + "q3": "Q3", + "95_th_percentile": "95%分位数", + "range": "范围", + "iqr": "四分位距 (IQR)", + "quantile_statistics": "数据集统计", + "standard_deviation": "标准差", + "cv": "变异系数 (CV)", + "kurtosis": "峰度", + "mad": "绝对中位差 (MAD)", + "skewness": "偏度", + "sum": "总和", + "variance": "变量", + "monotonicity": "单调性", + "descriptive_statistics": "数据集统计", + "statistics": "数据集统计", + "augmented_dickey_fuller_test_value": "增强型Dickey-Fuller检验的p值", + "autocorrelation": "相关性", + "autocorrelation_caption": "ACF 和 PACF", + "timeseries": "时序列", + "timeseries_plot": "时序图", + "scheme": "方案", + "netloc": "主机标识", + "query": "请求参数", + "fragment": "锚点" + } + } + }, + "html": { + "alerts": { + "title": "警告", + "not_present": "此数据集中不存在此警告", + "has_constant_value": "具有恒定值", + "has_constant_length": "具有恒定长度", + "has_dirty_categories": "存在脏分类", + "has_high_cardinality": "具有高基数性", + "distinct_values": "不同值", + "dataset_has": "数据集包含", + "duplicate_rows": "重复行", + "dataset_is_empty": "数据集为空", + "is_highly": "高度", + "correlated_with": "与之相关", + "and": "与", + "other_fields": "其他字段", + "highly_imbalanced": "存在高度不平衡", + "has": "有", + "infinite_values": "无限值", + "missing_values": "缺失值", + "near_duplicate_rows": "近似重复的行", + "non_stationary": "是非平稳的", + "seasonal": "是季节性的", + "highly_skewed": "高度偏斜", + "truncated_files": "截断的文件", + "alert_type_date": "仅包含日期时间值,但属于分类数据。考虑应用", + "uniformly_distributed": "均匀分布", + "unique_values": "具有唯一值", + "alert_unsupported": "这是一个不受支持的类型,请检查是否需要清理或进一步分析", + "zeros": "零值" + }, + "sequence": { + "overview_tabs": { + "brought_to_you_by": "由 YData 为您提供" + } + }, + "dropdown": "选择列", + "frequency_table":{ + "value": "值", + "count": "计数", + "frequency_percentage": "频率 (%)", + "redacted_value": "已隐藏的值", + "no_values_found": "未找到任何值。" + }, + "scores": { + "overall_data_quality": "整体数据质量评分" + }, + "variable_info": { + 
"no_alerts": "无警报" + } + } +} \ No newline at end of file diff --git a/src/ydata_profiling/i18n/tools.py b/src/ydata_profiling/i18n/tools.py new file mode 100644 index 000000000..d1cc41029 --- /dev/null +++ b/src/ydata_profiling/i18n/tools.py @@ -0,0 +1,119 @@ +""" +Translation tools for ydata-profiling +""" +import json +from pathlib import Path +from typing import Dict, Any, Union +import click + + +def create_translation_template(locale: str = 'en', output_dir: Union[str, Path] = '.'): + """Create a translation template file for customization + + Args: + locale: Source locale to use as template + output_dir: Output directory + """ + from ydata_profiling.i18n import export_translation_template + + output_path = Path(output_dir) / f"{locale}_template.json" + export_translation_template(locale, output_path) + return output_path + + +def validate_translation_file(file_path: Union[str, Path], reference_locale: str = 'en') -> Dict[str, Any]: + """Validate a translation file against reference + + Args: + file_path: Path to translation file to validate + reference_locale: Reference locale to compare against + + Returns: + Validation result dictionary + """ + from ydata_profiling.i18n import _translation_manager + + file_path = Path(file_path) + if not file_path.exists(): + return {"valid": False, "error": f"File {file_path} does not exist"} + + try: + with open(file_path, 'r', encoding='utf-8') as f: + translations = json.load(f) + except Exception as e: + return {"valid": False, "error": f"Invalid JSON: {e}"} + + # Get reference translations + if reference_locale not in _translation_manager.translations: + return {"valid": False, "error": f"Reference locale '{reference_locale}' not found"} + + reference = _translation_manager.translations[reference_locale] + + # Check for missing and extra keys + missing_keys = [] + extra_keys = [] + + def check_keys(ref_dict: dict, trans_dict: dict, prefix: str = ""): + for key, value in ref_dict.items(): + full_key = 
f"{prefix}.{key}" if prefix else key + if key not in trans_dict: + missing_keys.append(full_key) + elif isinstance(value, dict) and isinstance(trans_dict[key], dict): + check_keys(value, trans_dict[key], full_key) + + for key in trans_dict: + full_key = f"{prefix}.{key}" if prefix else key + if key not in ref_dict: + extra_keys.append(full_key) + + check_keys(reference, translations) + + result = { + "valid": len(missing_keys) == 0, + "missing_keys": missing_keys, + "extra_keys": extra_keys, + "total_keys": len(missing_keys) + len(extra_keys) + } + + return result + + +@click.group() +def cli(): + """YData Profiling Translation Tools""" + pass + + +@cli.command() +@click.option('--locale', '-l', default='en', help='Source locale for template') +@click.option('--output', '-o', default='.', help='Output directory') +def create_template(locale: str, output: str): + """Create a translation template file""" + output_path = create_translation_template(locale, output) + click.echo(f"Translation template created: {output_path}") + + +@cli.command() +@click.argument('file_path') +@click.option('--reference', '-r', default='en', help='Reference locale') +def validate(file_path: str, reference: str): + """Validate a translation file""" + result = validate_translation_file(file_path, reference) + + if result["valid"]: + click.echo(click.style("✓ Translation file is valid!", fg='green')) + else: + click.echo(click.style("✗ Translation file has issues:", fg='red')) + + if 'error' in result: + click.echo(f"Error: {result['error']}") + else: + if result['missing_keys']: + click.echo(f"\nMissing keys ({len(result['missing_keys'])}):") + for key in result['missing_keys']: + click.echo(f" - {key}") + + if result['extra_keys']: + click.echo(f"\nExtra keys ({len(result['extra_keys'])}):") + for key in result['extra_keys']: + click.echo(f" + {key}") \ No newline at end of file diff --git a/src/ydata_profiling/model/missing.py b/src/ydata_profiling/model/missing.py index 
46ec2dee3..8264bbbd2 100644 --- a/src/ydata_profiling/model/missing.py +++ b/src/ydata_profiling/model/missing.py @@ -5,6 +5,7 @@ import pandas as pd from ydata_profiling.config import Settings +from ydata_profiling.i18n import _ class MissingDataBackend: @@ -70,20 +71,20 @@ def get_missing_active(config: Settings, table_stats: dict) -> Dict[str, Any]: missing_map = { "bar": { "min_missing": 0, - "name": "Count", - "caption": "A simple visualization of nullity by column.", + "name": _("core.model.bar_count"), + "caption": _("core.model.bar_caption"), "function": MissingBar(), }, "matrix": { "min_missing": 0, - "name": "Matrix", - "caption": "Nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.", + "name": _("core.model.matrix"), + "caption": _("core.model.matrix_caption"), "function": MissingMatrix(), }, "heatmap": { "min_missing": 2, - "name": "Heatmap", - "caption": "The correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another.", + "name": _("core.model.heatmap"), + "caption": _("core.model.heatmap_caption"), "function": MissingHeatmap(), }, } diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index e711acdd9..ba98f0601 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -17,6 +17,7 @@ series_hashable, ) from ydata_profiling.utils.information import DisplayInfo +from ydata_profiling.i18n import _ def get_character_counts_vc(vc: pd.Series) -> pd.Series: @@ -67,7 +68,7 @@ def unicode_summary_vc(vc: pd.Series) -> dict: from unicodedata import category as _category # pylint: disable=import-error category = _category # type: ignore - char_handler = lambda char: "(unknown)" # noqa: E731 + char_handler = lambda char: _("core.unknown") # noqa: E731
block = char_handler block_abbr = char_handler category_long = char_handler diff --git a/src/ydata_profiling/model/pandas/sample_pandas.py b/src/ydata_profiling/model/pandas/sample_pandas.py index 10feb8969..9f0a53fe6 100644 --- a/src/ydata_profiling/model/pandas/sample_pandas.py +++ b/src/ydata_profiling/model/pandas/sample_pandas.py @@ -4,6 +4,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.sample import Sample, get_sample +from ydata_profiling.i18n import _ @get_sample.register(Settings, pd.DataFrame) @@ -23,16 +24,16 @@ def pandas_get_sample(config: Settings, df: pd.DataFrame) -> List[Sample]: n_head = config.samples.head if n_head > 0: - samples.append(Sample(id="head", data=df.head(n=n_head), name="First rows")) + samples.append(Sample(id="head", data=df.head(n=n_head), name=_("core.model.first_rows"))) n_tail = config.samples.tail if n_tail > 0: - samples.append(Sample(id="tail", data=df.tail(n=n_tail), name="Last rows")) + samples.append(Sample(id="tail", data=df.tail(n=n_tail), name=_("core.model.last_rows"))) n_random = config.samples.random if n_random > 0: samples.append( - Sample(id="random", data=df.sample(n=n_random), name="Random sample") + Sample(id="random", data=df.sample(n=n_random), name=_("core.model.random_sample")) ) return samples diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index a7d6d9134..9d003f4b9 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -47,6 +47,7 @@ from ydata_profiling.utils.dataframe import hash_dataframe from ydata_profiling.utils.logger import ProfilingLogger from ydata_profiling.utils.paths import get_config +from ydata_profiling.i18n import _, set_locale, get_locale logger = ProfilingLogger(name="ReportLogger") @@ -80,6 +81,7 @@ def __init__( summarizer: Optional[BaseSummarizer] = None, config: Optional[Settings] = None, type_schema: Optional[dict] = None, + locale: Optional[str] = None, **kwargs, 
): """Generate a ProfileReport based on a pandas or spark.sql DataFrame @@ -103,8 +105,21 @@ def __init__( typeset: optional user typeset to use for type inference summarizer: optional user summarizer to generate custom summary output type_schema: optional dict containing pairs of `column name`: `type` + locale: language locale for report generation (e.g., 'en', 'zh') **kwargs: other arguments, for valid arguments, check the default configuration file. """ + from ydata_profiling.i18n import set_locale, get_locale + + # Save current language settings + current_locale = get_locale() + + # If the locale parameter is explicitly specified, use it + if locale: + set_locale(locale) + target_locale = locale + else: + # Otherwise, use the current global language setting + target_locale = current_locale self.__validate_inputs(df, minimal, tsmode, config_file, lazy) @@ -123,6 +138,11 @@ def __init__( else: report_config = SparkSettings() + # Ensure that the language settings in the configuration are consistent with the target language + if target_locale != 'en': + report_config.i18n.locale = target_locale + set_locale(target_locale) + groups = [ (explorative, "explorative"), (sensitive, "sensitive"), @@ -144,6 +164,10 @@ def __init__( if kwargs: report_config = report_config.update(kwargs) + # Finally ensure the language setting is correct + report_config.i18n.locale = target_locale + set_locale(target_locale) + report_config.vars.timeseries.active = tsmode if tsmode and sortby: report_config.vars.timeseries.sortby = sortby @@ -411,7 +435,7 @@ def _render_html(self) -> str: report = self.report with tqdm( - total=1, desc="Render HTML", disable=not self.config.progress_bar + total=1, desc=_("rendering.html_progress"), disable=not self.config.progress_bar ) as pbar: html = HTMLReport(copy.deepcopy(report)).render( nav=self.config.html.navbar_show, @@ -440,7 +464,7 @@ def _render_widgets(self) -> Any: with tqdm( total=1, - desc="Render widgets", + 
desc=_("rendering.widgets_progress"), disable=not self.config.progress_bar, leave=False, ) as pbar: @@ -477,7 +501,7 @@ def encode_it(o: Any) -> Any: description = self.description_set with tqdm( - total=1, desc="Render JSON", disable=not self.config.progress_bar + total=1, desc=_("rendering.json_progress"), disable=not self.config.progress_bar ) as pbar: description_dict = format_summary(description) description_dict = encode_it(description_dict) diff --git a/src/ydata_profiling/report/presentation/core/alerts.py b/src/ydata_profiling/report/presentation/core/alerts.py index 79de56287..113006d5c 100644 --- a/src/ydata_profiling/report/presentation/core/alerts.py +++ b/src/ydata_profiling/report/presentation/core/alerts.py @@ -3,6 +3,7 @@ from ydata_profiling.config import Style from ydata_profiling.model.alerts import Alert from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class Alerts(ItemRenderer): @@ -12,7 +13,7 @@ def __init__( super().__init__("alerts", {"alerts": alerts, "style": style}, **kwargs) def __repr__(self): - return "Alerts" + return _("core.alerts") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/collapse.py b/src/ydata_profiling/report/presentation/core/collapse.py index a7dba34f1..2633588aa 100644 --- a/src/ydata_profiling/report/presentation/core/collapse.py +++ b/src/ydata_profiling/report/presentation/core/collapse.py @@ -3,6 +3,7 @@ from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer from ydata_profiling.report.presentation.core.renderable import Renderable from ydata_profiling.report.presentation.core.toggle_button import ToggleButton +from ydata_profiling.i18n import _ class Collapse(ItemRenderer): @@ -10,7 +11,7 @@ def __init__(self, button: ToggleButton, item: Renderable, **kwargs): super().__init__("collapse", {"button": button, "item": item}, **kwargs) def __repr__(self) -> str: - 
return "Collapse" + return _("core.collapse") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/container.py b/src/ydata_profiling/report/presentation/core/container.py index c82f06266..e48ff0a62 100644 --- a/src/ydata_profiling/report/presentation/core/container.py +++ b/src/ydata_profiling/report/presentation/core/container.py @@ -1,6 +1,7 @@ from typing import Any, Callable, Optional, Sequence from ydata_profiling.report.presentation.core.renderable import Renderable +from ydata_profiling.i18n import _ class Container(Renderable): @@ -25,7 +26,7 @@ def __init__( self.sequence_type = sequence_type def __str__(self) -> str: - text = "Container\n" + text = f"{_('core.container')}\n" if "items" in self.content: for id, item in enumerate(self.content["items"]): name = str(item).replace("\n", "\n\t") @@ -35,9 +36,9 @@ def __str__(self) -> str: def __repr__(self) -> str: if "name" in self.content: name = self.content["name"] - return f"Container(name={name})" + return f"{_('core.container')}(name={name})" else: - return "Container" + return _("core.container") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/correlation_table.py b/src/ydata_profiling/report/presentation/core/correlation_table.py index 174d0e708..64a47a050 100644 --- a/src/ydata_profiling/report/presentation/core/correlation_table.py +++ b/src/ydata_profiling/report/presentation/core/correlation_table.py @@ -3,6 +3,7 @@ import pandas as pd from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class CorrelationTable(ItemRenderer): @@ -15,7 +16,7 @@ def __init__(self, name: str, correlation_matrix: pd.DataFrame, **kwargs): ) def __repr__(self) -> str: - return "CorrelationTable" + return _("core.correlationTable") def render(self) -> Any: raise NotImplementedError() diff --git
a/src/ydata_profiling/report/presentation/core/dropdown.py b/src/ydata_profiling/report/presentation/core/dropdown.py index c1c2f274e..919383aa8 100644 --- a/src/ydata_profiling/report/presentation/core/dropdown.py +++ b/src/ydata_profiling/report/presentation/core/dropdown.py @@ -3,6 +3,7 @@ from ydata_profiling.report.presentation.core.container import Container from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer from ydata_profiling.report.presentation.core.renderable import Renderable +from ydata_profiling.i18n import _ class Dropdown(ItemRenderer): @@ -32,7 +33,7 @@ def __init__( ) def __repr__(self) -> str: - return "Dropdown" + return _("core.dropdown") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/duplicate.py b/src/ydata_profiling/report/presentation/core/duplicate.py index 907d19376..8dfc28a93 100644 --- a/src/ydata_profiling/report/presentation/core/duplicate.py +++ b/src/ydata_profiling/report/presentation/core/duplicate.py @@ -3,6 +3,7 @@ import pandas as pd from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class Duplicate(ItemRenderer): @@ -10,7 +11,7 @@ def __init__(self, name: str, duplicate: pd.DataFrame, **kwargs): super().__init__("duplicate", {"duplicate": duplicate}, name=name, **kwargs) def __repr__(self) -> str: - return "Duplicate" + return _("core.duplicate") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/frequency_table.py b/src/ydata_profiling/report/presentation/core/frequency_table.py index 060e82da3..cd12da85d 100644 --- a/src/ydata_profiling/report/presentation/core/frequency_table.py +++ b/src/ydata_profiling/report/presentation/core/frequency_table.py @@ -1,6 +1,7 @@ from typing import Any from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class 
FrequencyTable(ItemRenderer): @@ -8,7 +9,7 @@ def __init__(self, rows: list, redact: bool, **kwargs): super().__init__("frequency_table", {"rows": rows, "redact": redact}, **kwargs) def __repr__(self) -> str: - return "FrequencyTable" + return _("core.frequencyTable") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/frequency_table_small.py b/src/ydata_profiling/report/presentation/core/frequency_table_small.py index c7036b3b7..e1d61597f 100644 --- a/src/ydata_profiling/report/presentation/core/frequency_table_small.py +++ b/src/ydata_profiling/report/presentation/core/frequency_table_small.py @@ -1,6 +1,7 @@ from typing import Any, List from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class FrequencyTableSmall(ItemRenderer): @@ -10,7 +11,7 @@ def __init__(self, rows: List[Any], redact: bool, **kwargs): ) def __repr__(self) -> str: - return "FrequencyTableSmall" + return _("core.frequencyTableSmall") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/html.py b/src/ydata_profiling/report/presentation/core/html.py index a93e53656..a7aa56621 100644 --- a/src/ydata_profiling/report/presentation/core/html.py +++ b/src/ydata_profiling/report/presentation/core/html.py @@ -1,6 +1,7 @@ from typing import Any from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class HTML(ItemRenderer): @@ -8,7 +9,7 @@ def __init__(self, content: str, **kwargs): super().__init__("html", {"html": content}, **kwargs) def __repr__(self) -> str: - return "HTML" + return _("core.html") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/image.py b/src/ydata_profiling/report/presentation/core/image.py index 4d991922d..ee6b0cf04 100644 --- 
a/src/ydata_profiling/report/presentation/core/image.py +++ b/src/ydata_profiling/report/presentation/core/image.py @@ -2,6 +2,7 @@ from ydata_profiling.config import ImageType from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class Image(ItemRenderer): @@ -28,7 +29,7 @@ def __init__( ) def __repr__(self) -> str: - return "Image" + return _("core.image") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/sample.py b/src/ydata_profiling/report/presentation/core/sample.py index 6ce6194fd..ac0e26e0d 100644 --- a/src/ydata_profiling/report/presentation/core/sample.py +++ b/src/ydata_profiling/report/presentation/core/sample.py @@ -3,6 +3,7 @@ import pandas as pd from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class Sample(ItemRenderer): @@ -14,7 +15,7 @@ def __init__( ) def __repr__(self) -> str: - return "Sample" + return _("core.sample") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/scores.py b/src/ydata_profiling/report/presentation/core/scores.py index 7ff70570a..7414cef4d 100644 --- a/src/ydata_profiling/report/presentation/core/scores.py +++ b/src/ydata_profiling/report/presentation/core/scores.py @@ -5,6 +5,7 @@ from ydata_profiling.config import Style from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class Scores(ItemRenderer): @@ -26,7 +27,7 @@ def __init__( super().__init__("scores", content=content, **kwargs) def __repr__(self) -> str: - return "Scores" + return _("core.scores") def render(self) -> Any: raise NotImplementedError("Handled by flavour-specific class") diff --git a/src/ydata_profiling/report/presentation/core/table.py b/src/ydata_profiling/report/presentation/core/table.py index 46fa9e1ad..78c2c95d1 100644 --- 
a/src/ydata_profiling/report/presentation/core/table.py +++ b/src/ydata_profiling/report/presentation/core/table.py @@ -2,6 +2,7 @@ from ydata_profiling.config import Style from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class Table(ItemRenderer): @@ -20,7 +21,7 @@ def __init__( ) def __repr__(self) -> str: - return "Table" + return _("core.table") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/toggle_button.py b/src/ydata_profiling/report/presentation/core/toggle_button.py index c6ce1b861..054a4d35c 100644 --- a/src/ydata_profiling/report/presentation/core/toggle_button.py +++ b/src/ydata_profiling/report/presentation/core/toggle_button.py @@ -1,6 +1,7 @@ from typing import Any from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class ToggleButton(ItemRenderer): @@ -8,7 +9,7 @@ def __init__(self, text: str, **kwargs): super().__init__("toggle_button", {"text": text}, **kwargs) def __repr__(self) -> str: - return "ToggleButton" + return _("core.toggle_button") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/variable.py b/src/ydata_profiling/report/presentation/core/variable.py index cdf063202..0a24c6517 100644 --- a/src/ydata_profiling/report/presentation/core/variable.py +++ b/src/ydata_profiling/report/presentation/core/variable.py @@ -2,6 +2,7 @@ from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer from ydata_profiling.report.presentation.core.renderable import Renderable +from ydata_profiling.i18n import _ class Variable(ItemRenderer): @@ -20,13 +21,13 @@ def __str__(self): top_text = str(self.content["top"]).replace("\n", "\n\t") bottom_text = str(self.content["bottom"]).replace("\n", "\n\t") - text = "Variable\n" + text = f"{_('core.variable')}\n" text += f"- top: {top_text}"
text += f"- bottom: {bottom_text}" return text def __repr__(self): - return "Variable" + return _("core.variable") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/core/variable_info.py b/src/ydata_profiling/report/presentation/core/variable_info.py index 9eaa54a39..6c5ff966c 100644 --- a/src/ydata_profiling/report/presentation/core/variable_info.py +++ b/src/ydata_profiling/report/presentation/core/variable_info.py @@ -3,6 +3,7 @@ from ydata_profiling.config import Style from ydata_profiling.model.alerts import Alert from ydata_profiling.report.presentation.core.item_renderer import ItemRenderer +from ydata_profiling.i18n import _ class VariableInfo(ItemRenderer): @@ -30,7 +31,7 @@ def __init__( ) def __repr__(self) -> str: - return "VariableInfo" + return _("core.variable_info") def render(self) -> Any: raise NotImplementedError() diff --git a/src/ydata_profiling/report/presentation/flavours/html/__init__.py b/src/ydata_profiling/report/presentation/flavours/html/__init__.py index 50e234f77..72be96f2d 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/__init__.py +++ b/src/ydata_profiling/report/presentation/flavours/html/__init__.py @@ -25,6 +25,7 @@ from ydata_profiling.report.presentation.flavours.html.variable_info import ( HTMLVariableInfo, ) +from ydata_profiling.report.presentation.flavours.html.i18n_extension import I18nExtension __all__ = [ "HTMLCollapse", @@ -44,4 +45,5 @@ "HTMLAlerts", "HTMLCorrelationTable", "HTMLScores", + "I18nExtension", ] diff --git a/src/ydata_profiling/report/presentation/flavours/html/i18n_extension.py b/src/ydata_profiling/report/presentation/flavours/html/i18n_extension.py new file mode 100644 index 000000000..47dbf0f5e --- /dev/null +++ b/src/ydata_profiling/report/presentation/flavours/html/i18n_extension.py @@ -0,0 +1,33 @@ +""" +Jinja2 internationalization extension for ydata-profiling +""" +from jinja2 import nodes +from jinja2.ext import Extension 
+from ydata_profiling.i18n import _ + +class I18nExtension(Extension): + """Jinja2 extension for internationalization""" + + tags = {'trans'} + + def __init__(self, environment): + super().__init__(environment) + environment.globals['_'] = _ + environment.globals['gettext'] = _ + environment.filters['trans'] = self.translate_filter + + def translate_filter(self, key, **kwargs): + """Filter for translating keys in templates""" + return _(key, **kwargs) + + def parse(self, parser): + """Parse trans tag for {% trans %} syntax""" + lineno = next(parser.stream).lineno + key = parser.parse_expression() + return nodes.Output([ + nodes.Call( + nodes.Name('_', 'load'), + [key], + [] + ) + ]).set_lineno(lineno) \ No newline at end of file diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates.py b/src/ydata_profiling/report/presentation/flavours/html/templates.py index 85e24a46a..69a8dc1b8 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates.py +++ b/src/ydata_profiling/report/presentation/flavours/html/templates.py @@ -6,20 +6,33 @@ from ydata_profiling.config import Settings from ydata_profiling.report.formatters import fmt, fmt_badge, fmt_numeric, fmt_percent +from ydata_profiling.i18n import _ +from ydata_profiling.report.presentation.flavours.html.i18n_extension import I18nExtension # Initializing Jinja package_loader = jinja2.PackageLoader( "ydata_profiling", "report/presentation/flavours/html/templates" ) jinja2_env = jinja2.Environment( - lstrip_blocks=True, trim_blocks=True, loader=package_loader + lstrip_blocks=True, + trim_blocks=True, + loader=package_loader, + extensions=[I18nExtension] # Add internationalization extension ) + +# Adding translation function to Jinja2 environment +jinja2_env.globals['_'] = _ +jinja2_env.globals['gettext'] = _ + jinja2_env.filters["is_list"] = lambda x: isinstance(x, list) jinja2_env.filters["fmt_badge"] = fmt_badge jinja2_env.filters["fmt_percent"] = fmt_percent 
jinja2_env.filters["fmt_numeric"] = fmt_numeric jinja2_env.filters["fmt"] = fmt +# Add translation filter +jinja2_env.filters["trans"] = lambda key, **kwargs: _(key, **kwargs) + def template(template_name: str) -> jinja2.Template: """Get the template object given the name. diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts.html index 3a53428b8..f95ae6bbe 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts.html @@ -1,6 +1,6 @@
-

Alerts

+

{{ _('html.alerts.title') }}

{% if alerts is mapping %} @@ -19,7 +19,7 @@ {% if not alert._is_empty %} {% include 'alerts/alert_' + alert.alert_type.name | lower + '.html' %} {% else %} - Alert not present in this dataset + {{ _('html.alerts.not_present') }} {% endif %} {% endfor %} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html index 2689418bd..81aea8265 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html @@ -1 +1 @@ -{{ alert.column_name }} has constant value "{{ alert.values['value_counts_without_nan'].index[0] }}" +{{ alert.column_name }} {{ _('html.alerts.has_constant_value')}} "{{ alert.values['value_counts_without_nan'].index[0] }}" diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant_length.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant_length.html index 8b4ecbc8c..9b82b9f62 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant_length.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant_length.html @@ -1 +1 @@ -{{ alert.column_name }} has constant length +{{ alert.column_name }} {{ _('html.alerts.has_constant_length')}} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html index f1683ca4e..ac6e3e1fa 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html @@ -1 +1 @@ -{{ alert.column_name }} has dirty 
categories: {{ alert.values['n_fuzzy_vals'] }} distinct values +{{ alert.column_name }} {{ _('html.alerts.has_dirty_categories')}}: {{ alert.values['n_fuzzy_vals'] }} {{ _('html.alerts.distinct_values')}} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_duplicates.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_duplicates.html index 9f0302f85..63ad20e17 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_duplicates.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_duplicates.html @@ -1 +1 @@ -Dataset has {{ alert.values['n_duplicates'] }} ({{ alert.values['p_duplicates'] | fmt_percent }}) duplicate rows +{{ _('html.alerts.dataset_has')}} {{ alert.values['n_duplicates'] }} ({{ alert.values['p_duplicates'] | fmt_percent }}) {{ _('html.alerts.duplicate_rows')}} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_empty.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_empty.html index a676c9577..cab787114 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_empty.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_empty.html @@ -1 +1 @@ -Dataset is empty +{{ _('html.alerts.dataset_is_empty') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_high_cardinality.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_high_cardinality.html index 7f5a06000..a201b7d92 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_high_cardinality.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_high_cardinality.html @@ -1 +1 @@ -{{ alert.column_name }} has a high cardinality: {{ alert.values['n_distinct'] }} distinct values +{{ alert.column_name }} 
{{ _('html.alerts.has_high_cardinality')}}: {{ alert.values['n_distinct'] }} {{ _('html.alerts.distinct_values')}} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_high_correlation.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_high_correlation.html index 6a0ec6ac6..9a119a7cf 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_high_correlation.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_high_correlation.html @@ -1,4 +1,4 @@ {{ alert.column_name }} - is highly {{ alert.values['corr'] }} correlated with {{ alert.values['fields'][0] }} {% if alert.values['fields'] | length > 1 %} and {{ alert.values['fields'] | length - 1 }} other fields{% endif %} + {{ _('html.alerts.is_highly')}} {{ alert.values['corr'] }} {{ _('html.alerts.correlated_with')}} {{ alert.values['fields'][0] }} {% if alert.values['fields'] | length > 1 %} {{ _('html.alerts.and')}} {{ alert.values['fields'] | length - 1 }} {{ _('html.alerts.other_fields')}}{% endif %} {# (ρ = {{ alert.values['correlation'] | fmt_numeric }}) #} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_imbalance.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_imbalance.html index 6ac9be4fd..27e4c405c 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_imbalance.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_imbalance.html @@ -1 +1 @@ -{{ alert.column_name }} is highly imbalanced ({{ alert.values['imbalance'] | fmt_percent}}) \ No newline at end of file +{{ alert.column_name }} {{ _('html.alerts.highly_imbalanced')}} ({{ alert.values['imbalance'] | fmt_percent}}) \ No newline at end of file diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_infinite.html 
b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_infinite.html index 394644676..aeb054c4e 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_infinite.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_infinite.html @@ -1 +1 @@ -{{ alert.column_name }} has {{ alert.values['n_infinite'] }} ({{ alert.values['p_infinite'] | fmt_percent }}) infinite values +{{ alert.column_name }} {{ _('html.alerts.has') }} {{ alert.values['n_infinite'] }} ({{ alert.values['p_infinite'] | fmt_percent }}) {{ _('html.alerts.infinite_values') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_missing.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_missing.html index 9cc3ed49d..1bab15eaf 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_missing.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_missing.html @@ -1 +1 @@ -{{ alert.column_name }} has {{ alert.values['n_missing'] }} ({{ alert.values['p_missing'] | fmt_percent }}) missing values +{{ alert.column_name }} {{ _('html.alerts.has') }} {{ alert.values['n_missing'] }} ({{ alert.values['p_missing'] | fmt_percent }}) {{ _('html.alerts.missing_values') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html index 10ad13c82..9183ded55 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html @@ -1 +1 @@ -Dataset has {{ alert.values['n_near_dups'] }} ({{ alert.values['p_near_dups'] | fmt_percent }}) near duplicate rows +{{ _('html.alerts.dataset_has')}} {{ 
alert.values['n_near_dups'] }} ({{ alert.values['p_near_dups'] | fmt_percent }}) {{ _('html.alerts.near_duplicate_rows') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_non_stationary.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_non_stationary.html index d9f44b88b..44a7e0504 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_non_stationary.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_non_stationary.html @@ -1 +1 @@ -{{ alert.column_name }} is non stationary +{{ alert.column_name }} {{ _('html.alerts.non_stationary') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_seasonal.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_seasonal.html index eb707b2bf..520df4f33 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_seasonal.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_seasonal.html @@ -1 +1 @@ -{{ alert.column_name }} is seasonal +{{ alert.column_name }} {{ _('html.alerts.seasonal') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_skewed.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_skewed.html index 54a8e09e7..c4037a2e4 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_skewed.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_skewed.html @@ -1 +1 @@ -{{ alert.column_name }} is highly skewed (γ1 = {{ alert.values['skewness'] | fmt_numeric }}) +{{ alert.column_name }} {{ _('html.alerts.highly_skewed') }} (γ1 = {{ alert.values['skewness'] | fmt_numeric }}) diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_truncated.html 
b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_truncated.html index ef9d3571b..0a2883aa4 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_truncated.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_truncated.html @@ -1 +1 @@ -{{ alert.column_name }} has {{ alert.values['n_truncated'] }} ({{ alert.values['p_truncated'] | fmt_percent }}) truncated files +{{ alert.column_name }} {{ _('html.alerts.has') }} {{ alert.values['n_truncated'] }} ({{ alert.values['p_truncated'] | fmt_percent }}) {{ _('html.alerts.truncated_files') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_type_date.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_type_date.html index d827bf27d..4e02beb84 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_type_date.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_type_date.html @@ -1 +1 @@ -{{ alert.column_name }} only contains datetime values, but is categorical. 
Consider applying pd.to_datetime() +{{ alert.column_name }} {{ _('html.alerts.alert_type_date') }} pd.to_datetime() diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_uniform.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_uniform.html index 6971b74da..f2c51fb8b 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_uniform.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_uniform.html @@ -1 +1 @@ -{{ alert.column_name }} is uniformly distributed +{{ alert.column_name }} {{ _('html.alerts.uniformly_distributed') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_unique.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_unique.html index e4c305794..7fabb4f9e 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_unique.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_unique.html @@ -1 +1 @@ -{{ alert.column_name }} has unique values +{{ alert.column_name }} {{ _('html.alerts.unique_values') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_unsupported.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_unsupported.html index 3a3d95c61..fcef8888b 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_unsupported.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_unsupported.html @@ -1 +1 @@ -{{ alert.column_name }} is an unsupported type, check if it needs cleaning or further analysis +{{ alert.column_name }} {{ _('html.alerts.alert_unsupported') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_zeros.html 
b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_zeros.html index f9cedb455..307811978 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_zeros.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_zeros.html @@ -1 +1 @@ -{{ alert.column_name }} has {{ alert.values['n_zeros'] }} ({{ alert.values['p_zeros'] | fmt_percent }}) zeros +{{ alert.column_name }} {{ _('html.alerts.has') }} {{ alert.values['n_zeros'] }} ({{ alert.values['p_zeros'] | fmt_percent }}) {{ _('html.alerts.zeros') }} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/dropdown.html b/src/ydata_profiling/report/presentation/flavours/html/templates/dropdown.html index 630d08845..1d76ba3a4 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/dropdown.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/dropdown.html @@ -2,7 +2,7 @@
{% endif %}
- - - + + + @@ -12,7 +12,7 @@ {% for row in rows %} {% if row['extra_class'] | length == 0 and redact %} - {% else %} @@ -37,7 +37,7 @@ {% endfor %} {% else %} - + {% endif %} diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/scores.html b/src/ydata_profiling/report/presentation/flavours/html/templates/scores.html index 90ad82cda..af875fa36 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/scores.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/scores.html @@ -36,7 +36,7 @@
-
Overall Data Quality Score
+
{{ _('html.scores.overall_data_quality') }}
{% for i in range(overall_score | length) %}
diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/overview_tabs.html b/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/overview_tabs.html index e068278fb..a1f2378da 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/overview_tabs.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/sequence/overview_tabs.html @@ -1,6 +1,6 @@ {% if tabs | length > 0 %} {% if oss %} -

Brought to you by YData

+

{{ _('html.sequence.overview_tabs.brought_to_you_by') | safe }}

{% endif %}
ValueCountFrequency (%){{ _('html.frequency_table.value') }}{{ _('html.frequency_table.count') }}{{ _('html.frequency_table.frequency_percentage') }}
+
No values found.{{ _('html.frequency_table.no_values_found') }}