Skip to content

Commit 43650c7

Browse files
CLDR-18745 llm_cldr_validator.py (#4903)
1 parent 654ac0f commit 43650c7

File tree

1 file changed

+169
-0
lines changed

1 file changed

+169
-0
lines changed
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
import json
2+
from openai import OpenAI
3+
4+
# --- Part 1: LLM Generator ---
5+
# This function calls the LLM to get its understanding of the data.
6+
7+
def generate_data_with_llm(user_prompt: str) -> dict:
    """
    Uses the OpenAI API to generate CLDR-like data.

    The API key is read from the ``OPENAI_API_KEY`` environment variable
    instead of being pasted into the source file, so the secret never ends
    up in version control.

    Args:
        user_prompt: The user's free-form question about locale data.

    Returns:
        The JSON object produced by the model, parsed into a dict. The
        system prompt asks the model for the keys 'Entity', 'CountryCode',
        'DataType' and 'Data'. If the key is missing, the API call fails or
        the reply cannot be parsed, a dict of the form
        ``{"error": "<message>"}`` is returned instead of raising.

    Note: The 'DataType' from the LLM is crucial for looking up in the CLDR file.
    """
    # Local import: keeps this function self-contained without touching the
    # file's import header.
    import os

    system_instructions = """
    You are an expert assistant that provides locale-specific data. Based on the user's prompt, generate a single, raw JSON object.
    Your 'DataType' value must be chosen from this specific list to match the CLDR file:
    'Area (Default)', 'Area (Floor)', 'Area (Geographic)', 'Area (Land)', 'Blood Glucose', 'Vehicle Fuel Consumption', 'Duration (Media)',
    'Food Energy', 'Length (Person Height)', 'Length (Road)', 'Speed (Default)', 'Temperature (Weather)', 'Volume (Fluid)'.
    The JSON object must follow this exact structure:
    {
      "Entity": "<Country or Region Name, e.g., United Kingdom>",
      "CountryCode": "<Two-letter ISO code, e.g., GB>",
      "DataType": "<Chosen from the list above, e.g., Speed (Default)>",
      "Data": {
        "Item 1": {
          "Value": "<The primary unit, formatted like 'mile-per-hour'>",
          "Context": "<A brief description of its primary use>"
        },
        "Item 2": {
          "Value": "<A secondary unit, if applicable>",
          "Context": "<A brief description>"
        }
      }
    }
    Do not include any text or explanation before or after the JSON object. Omit "Item 2" if not relevant.
    """

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        # Fail early with an actionable message rather than an opaque
        # authentication error from the API.
        return {"error": "The OPENAI_API_KEY environment variable is not set."}

    try:
        client = OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": user_prompt},
            ],
            # Ask the API to return a JSON object so the reply can be parsed directly.
            response_format={"type": "json_object"},
        )
        return json.loads(response.choices[0].message.content)
    except Exception as e:
        # Deliberate catch-all at the API boundary: callers check for the
        # "error" key instead of handling exceptions.
        return {"error": str(e)}
51+
52+
53+
# --- Part 2: CLDR Data Fetcher ---
54+
# These functions load and find the ground truth from your local JSON file.
55+
56+
def load_cldr_data(filepath: str = "unitPreferenceData.json") -> "dict | None":
    """Loads the CLDR data from the local JSON file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The parsed JSON content, or None when the file is missing, cannot be
        read, or does not contain valid JSON. A message describing the
        problem is printed in every failure case.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found. Please make sure it's in the same directory.")
    except json.JSONDecodeError:
        print(f"Error: The file '{filepath}' is not a valid JSON file.")
    except (OSError, UnicodeDecodeError) as e:
        # Covers a directory path, missing permissions, or a file that is not
        # UTF-8: report it the same way instead of crashing the caller.
        print(f"Error: The file '{filepath}' could not be read: {e}")
    return None
69+
70+
def fetch_from_cldr(cldr_data: dict, cldr_category: str, cldr_usage: str, region_code: str) -> list:
    """Return the unit names CLDR lists for a category/usage in a region.

    The lookup walks cldr_data["supplemental"]["unitPreferenceData"]
    [cldr_category][cldr_usage]. The entry for ``region_code`` is used when
    present; otherwise the "001" entry is used. An empty list is returned
    when ``cldr_data`` is empty or any required key is missing.
    """
    if not cldr_data:
        return []

    unit_names = []
    try:
        usage_table = cldr_data["supplemental"]["unitPreferenceData"][cldr_category][cldr_usage]

        # "001" is the fallback entry when the region has none of its own.
        fallback_entries = usage_table.get("001", [])
        entries = usage_table.get(region_code, fallback_entries)

        # Keep only the unit name of each entry.
        for entry in entries:
            unit_names.append(entry['unit'])
    except KeyError:
        # Some part of the path (or an entry's 'unit' key) is missing.
        return []
    return unit_names
86+
87+
# --- Part 3: The Validator ---
88+
# This is the main logic that ties everything together.
89+
90+
if __name__ == "__main__":
    # Command-line entry point: ask the LLM for unit data, look up the same
    # category/usage/region in the local CLDR file, and print a JSON report
    # comparing the two unit lists position by position.

    # Imported here because only the entry point needs it; sys.exit() is used
    # instead of the interactive-shell helper exit().
    import sys

    # Load the ground truth data once
    cldr_data = load_cldr_data()
    if not cldr_data:
        sys.exit(1)  # Stop if the CLDR file can't be loaded

    # A simple mapping from the LLM's DataType to the keys in the CLDR json file
    # Format: "LLM DataType": ("cldr_category", "cldr_usage")
    # Every DataType offered to the LLM in the system prompt has an entry here.
    # NOTE(review): confirm the category/usage keys against the local
    # unitPreferenceData.json; a path that does not exist simply yields no units.
    cldr_mapping = {
        "Area (Default)": ("area", "default"),
        "Area (Floor)": ("area", "floor"),
        "Area (Geographic)": ("area", "geograph"),
        "Area (Land)": ("area", "land"),
        "Speed (Default)": ("speed", "default"),
        "Temperature (Weather)": ("temperature", "weather"),
        "Length (Person Height)": ("length", "person-height"),
        "Length (Road)": ("length", "road"),
        "Vehicle Fuel Consumption": ("consumption", "vehicle-fuel"),
        "Blood Glucose": ("concentration", "blood-glucose"),
        "Duration (Media)": ("duration", "media"),
        "Food Energy": ("energy", "food"),
        "Volume (Fluid)": ("volume", "fluid"),
    }

    # Get user input
    user_prompt = input("Enter your question about local data: ")

    # Step 1: Get data from the LLM
    print("\n Generating data with LLM...")
    llm_output = generate_data_with_llm(user_prompt)

    if "error" in llm_output:
        print(f"LLM Error: {llm_output['error']}")
        sys.exit(1)

    print("--- LLM Generated Data ---")
    print(json.dumps(llm_output, indent=4))

    # Step 2: Extract key info from LLM output to perform the lookup
    llm_data_type = llm_output.get("DataType")
    llm_country_code = llm_output.get("CountryCode")

    # The LLM output is untrusted: tolerate a missing or malformed "Data"
    # section so the problem is reported below instead of raising here.
    llm_items = llm_output.get("Data")
    if not isinstance(llm_items, dict):
        llm_items = {}
    llm_units = [
        item["Value"]
        for item in llm_items.values()
        if isinstance(item, dict) and "Value" in item
    ]

    if not all([llm_data_type, llm_country_code, llm_units]):
        print("\nError: LLM output was missing required keys (DataType, CountryCode, or Data).")
        sys.exit(1)

    # Step 3: Fetch the corresponding ground truth from CLDR
    print("\n Fetching ground truth from CLDR file...")
    mapping = cldr_mapping.get(llm_data_type) if isinstance(llm_data_type, str) else None
    if mapping is not None:
        category, usage = mapping
        cldr_units = fetch_from_cldr(cldr_data, category, usage, llm_country_code)
        cldr_lookup = f"Category: '{category}', Usage: '{usage}', Region: '{llm_country_code}'"
    else:
        # Unknown DataType: there is no category/usage to look up, so record
        # that in the report rather than referencing undefined names.
        print(f"Warning: No CLDR mapping for DataType '{llm_data_type}'.")
        cldr_units = []
        cldr_lookup = f"No CLDR mapping for DataType '{llm_data_type}'"

    # Step 4: Compare the results and create the final output
    print("\n Comparing results...")
    comparison_result = {
        "ValidationInput": {
            "Prompt": user_prompt,
            "LLM_Entity": llm_output.get("Entity"),
            "LLM_CountryCode": llm_country_code,
            "CLDR_Lookup": cldr_lookup,
        },
        "LLM_Units_Found": llm_units,
        "CLDR_Units_Found": cldr_units,
        "Comparison": [],
    }

    # Compare each unit found by the LLM with the CLDR unit at the same
    # position; "N/A" marks positions CLDR has no unit for.
    for i, llm_unit in enumerate(llm_units):
        cldr_unit = cldr_units[i] if i < len(cldr_units) else "N/A"
        match_status = "Match" if llm_unit == cldr_unit else "Mismatch"
        comparison_result["Comparison"].append({
            f"Unit_{i+1}": {
                "LLM_Unit": llm_unit,
                "CLDR_Unit": cldr_unit,
                "Status": match_status,
            }
        })

    # Print the final comparison
    print("\n--- Final Validation Report ---")
    print(json.dumps(comparison_result, indent=4))

0 commit comments

Comments
 (0)