|
| 1 | +import json |
| 2 | +from openai import OpenAI |
| 3 | + |
| 4 | +# --- Part 1: LLM Generator --- |
| 5 | +# This function calls the LLM to get its understanding of the data. |
| 6 | + |
def generate_data_with_llm(user_prompt: str) -> dict:
    """Ask the OpenAI chat API for CLDR-like unit data describing a locale.

    The API key is read from the ``OPENAI_API_KEY`` environment variable so
    that no secret has to be pasted into the source file.

    Args:
        user_prompt: The user's free-text question; sent as the user message.

    Returns:
        The JSON object produced by the model, parsed into a dict. Its
        'DataType' value is what the caller uses to look up the CLDR file.
        On any failure (missing key, API error, empty or non-object reply)
        a dict of the form ``{"error": "<message>"}`` is returned instead;
        this function does not raise for those cases.
    """
    import os  # local import so the block does not depend on file-level imports

    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
    if not api_key:
        # Fail fast, before any network traffic, with an actionable message.
        return {"error": "OPENAI_API_KEY environment variable is not set."}

    system_instructions = """
    You are an expert assistant that provides locale-specific data. Based on the user's prompt, generate a single, raw JSON object.
    Your 'DataType' value must be chosen from this specific list to match the CLDR file:
    'Area (Default)', 'Area (Floor)', 'Area (Geographic)', 'Area (Land)', 'Blood Glucose', 'Vehicle Fuel Consumption', 'Duration (Media)',
    'Food Energy', 'Length (Person Height)', 'Length (Road)', 'Speed (Default)', 'Temperature (Weather)', 'Volume (Fluid)'.
    The JSON object must follow this exact structure:
    {
      "Entity": "<Country or Region Name, e.g., United Kingdom>",
      "CountryCode": "<Two-letter ISO code, e.g., GB>",
      "DataType": "<Chosen from the list above, e.g., Speed (Default)>",
      "Data": {
        "Item 1": {
          "Value": "<The primary unit, formatted like 'mile-per-hour'>",
          "Context": "<A brief description of its primary use>"
        },
        "Item 2": {
          "Value": "<A secondary unit, if applicable>",
          "Context": "<A brief description>"
        }
      }
    }
    Do not include any text or explanation before or after the JSON object. Omit "Item 2" if not relevant.
    """

    try:
        client = OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": user_prompt},
            ],
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content
        if not content:
            return {"error": "LLM returned an empty response."}
        parsed = json.loads(content)
    except Exception as e:
        # Boundary with an external service: the caller expects an error
        # dict rather than an exception, so every failure is reported here.
        return {"error": str(e)}

    if not isinstance(parsed, dict):
        # The caller uses dict methods on the result; reject anything else.
        return {"error": "LLM response was not a JSON object."}
    return parsed
| 51 | + |
| 52 | + |
| 53 | +# --- Part 2: CLDR Data Fetcher --- |
| 54 | +# These functions load and find the ground truth from your local JSON file. |
| 55 | + |
def load_cldr_data(filepath: str = "unitPreferenceData.json") -> "dict | None":
    """Load the CLDR data from a local JSON file.

    Args:
        filepath: Path of the JSON file to read (UTF-8 encoded).

    Returns:
        The parsed JSON content, or ``None`` when the file is missing,
        cannot be read, or does not contain valid JSON. In each failure
        case a one-line explanation is printed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found. Please make sure it's in the same directory.")
    except OSError as exc:
        # e.g. the path is a directory or the file is not readable.
        print(f"Error: The file '{filepath}' could not be read: {exc}")
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Bytes that are not valid UTF-8 cannot be valid JSON text either.
        print(f"Error: The file '{filepath}' is not a valid JSON file.")
    return None
| 69 | + |
def fetch_from_cldr(cldr_data: dict, cldr_category: str, cldr_usage: str, region_code: str) -> list:
    """Fetch the preferred unit names from parsed CLDR data.

    Looks up ``cldr_data["supplemental"]["unitPreferenceData"]`` under the
    given category and usage, then picks the entry for the region. The
    region is matched as given first, then upper-cased and stripped (so
    "gb" finds "GB"), and finally falls back to the world default "001".

    Args:
        cldr_data: Parsed CLDR JSON; a falsy value yields an empty list.
        cldr_category: Category key, e.g. "speed".
        cldr_usage: Usage key inside the category, e.g. "default".
        region_code: Region key to look up, e.g. "GB".

    Returns:
        The 'unit' value of every entry for the chosen region, in file
        order. An empty list is returned when the path or region is absent
        or the data does not have the expected structure.
    """
    if not cldr_data:
        return []

    try:
        preferences = cldr_data["supplemental"]["unitPreferenceData"][cldr_category][cldr_usage]
    except (KeyError, TypeError):
        # Path not found, or an intermediate node is not a mapping.
        return []
    if not isinstance(preferences, dict):
        return []

    # Candidate keys in priority order; only strings can be region keys.
    candidates = []
    if isinstance(region_code, str):
        candidates.append(region_code)
        candidates.append(region_code.strip().upper())
    candidates.append("001")

    region_units = []
    for key in candidates:
        if key in preferences:
            region_units = preferences[key]
            break

    try:
        return [item['unit'] for item in region_units]
    except (KeyError, TypeError):
        # A malformed entry invalidates the whole lookup, as before.
        return []
| 86 | + |
| 87 | +# --- Part 3: The Validator --- |
| 88 | +# This is the main logic that ties everything together. |
| 89 | + |
if __name__ == "__main__":
    # Load the ground truth data once; nothing can be validated without it.
    cldr_data = load_cldr_data()
    if not cldr_data:
        raise SystemExit(1)

    # Maps the LLM's DataType to the keys used in the CLDR JSON file.
    # Format: "LLM DataType": ("cldr_category", "cldr_usage")
    # Every DataType offered to the LLM in the system prompt has an entry.
    cldr_mapping = {
        "Area (Default)": ("area", "default"),
        "Area (Floor)": ("area", "floor"),
        "Area (Geographic)": ("area", "geograph"),
        "Area (Land)": ("area", "land"),
        "Blood Glucose": ("concentration", "blood-glucose"),
        "Vehicle Fuel Consumption": ("consumption", "vehicle-fuel"),
        "Duration (Media)": ("duration", "media"),
        "Food Energy": ("energy", "food"),
        "Length (Person Height)": ("length", "person-height"),
        "Length (Road)": ("length", "road"),
        "Speed (Default)": ("speed", "default"),
        "Temperature (Weather)": ("temperature", "weather"),
        "Volume (Fluid)": ("volume", "fluid"),
    }

    # Get user input
    user_prompt = input("Enter your question about local data: ")

    # Step 1: Get data from the LLM
    print("\n Generating data with LLM...")
    llm_output = generate_data_with_llm(user_prompt)

    if "error" in llm_output:
        print(f"LLM Error: {llm_output['error']}")
        raise SystemExit(1)

    print("--- LLM Generated Data ---")
    print(json.dumps(llm_output, indent=4))

    # Step 2: Extract key info from LLM output to perform the lookup
    llm_data_type = llm_output.get("DataType")
    llm_country_code = llm_output.get("CountryCode")
    if isinstance(llm_country_code, str):
        # CLDR region keys are upper-case; tolerate "gb" or stray spaces.
        llm_country_code = llm_country_code.strip().upper()

    # The LLM output is untrusted: skip items that are not objects or that
    # carry no "Value" instead of crashing on them.
    llm_items = llm_output.get("Data")
    if isinstance(llm_items, dict):
        llm_units = [
            item["Value"]
            for item in llm_items.values()
            if isinstance(item, dict) and item.get("Value")
        ]
    else:
        llm_units = []

    if not all([llm_data_type, llm_country_code, llm_units]):
        print("\nError: LLM output was missing required keys (DataType, CountryCode, or Data).")
        raise SystemExit(1)

    # Step 3: Fetch the corresponding ground truth from CLDR
    print("\n Fetching ground truth from CLDR file...")
    if llm_data_type in cldr_mapping:
        category, usage = cldr_mapping[llm_data_type]
        cldr_units = fetch_from_cldr(cldr_data, category, usage, llm_country_code)
        cldr_lookup = f"Category: '{category}', Usage: '{usage}', Region: '{llm_country_code}'"
    else:
        # No mapping: report that explicitly rather than referencing
        # category/usage, which are not defined on this path.
        cldr_units = []
        cldr_lookup = f"No CLDR mapping for DataType '{llm_data_type}'"

    # Step 4: Compare the results and create the final output
    print("\n Comparing results...")
    comparison_result = {
        "ValidationInput": {
            "Prompt": user_prompt,
            "LLM_Entity": llm_output.get("Entity"),
            "LLM_CountryCode": llm_country_code,
            "CLDR_Lookup": cldr_lookup,
        },
        "LLM_Units_Found": llm_units,
        "CLDR_Units_Found": cldr_units,
        "Comparison": [],
    }

    # Compare each LLM unit with the CLDR unit at the same position;
    # "N/A" marks positions for which CLDR lists no unit.
    for i, llm_unit in enumerate(llm_units):
        cldr_unit = cldr_units[i] if i < len(cldr_units) else "N/A"
        match_status = "Match" if llm_unit == cldr_unit else "Mismatch"
        comparison_result["Comparison"].append({
            f"Unit_{i+1}": {
                "LLM_Unit": llm_unit,
                "CLDR_Unit": cldr_unit,
                "Status": match_status,
            }
        })

    # Print the final comparison
    print("\n--- Final Validation Report ---")
    print(json.dumps(comparison_result, indent=4))
0 commit comments