# Maps a chatbot personality YAML filename to the dataset used to probe it for
# hallucinations:
#   "name"  - a HuggingFace dataset id OR a path to a local .txt file of prompts
#             (consumers branch on the ".txt" suffix),
#   "split" - dataset split to load (only for HuggingFace datasets),
#   "col"   - column/field holding the question text.
DATASET_MAPPING = {
    "agribot_personality.yaml": {"name": "KisanVaani/agriculture-qa-english-only", "split": "train", "col": "question"},
    "healthcare_personality.yaml": {"name": "medalpaca/medical_meadow_medical_flashcards", "split": "train", "col": "input"},
    # Local prompt files shipped next to this script (no "split": read as plain text).
    "bartender_personality.yaml": {"name": str(Path(__file__).parent / "bartender_personality.txt"), "col": "text"},
    "culinara_personality.yaml": {"name": str(Path(__file__).parent / "culinara_personality.txt"), "col": "text"},
    "tutor_personality.yaml": {"name": str(Path(__file__).parent / "tutor_personality.txt"), "col": "text"},
}
# Directory where converted/downloaded models are stored.
MODEL_DIR = Path("model")
3638
@@ -56,12 +58,8 @@ def compute_deepeval_hallucination(inputs, outputs, contexts) -> float:
5658 return avg_score
5759
5860
59- def extract_personality_path (path ):
60- return os .path .basename (path )
61-
62-
63- def prepare_dataset_and_model (chat_model_name , personality_file_path , auth_token ):
64- dataset_info = DATASET_MAPPING .get (extract_personality_path (personality_file_path ), "" )
61+ def prepare_dataset_and_model (chat_model_name : str , personality_file_path : Path , auth_token : str ):
62+ dataset_info = DATASET_MAPPING .get (personality_file_path .name , "" )
6563 assert dataset_info != ""
6664 log .info ("Loading dataset" )
6765 if dataset_info ["name" ].endswith (".txt" ):
@@ -113,26 +111,31 @@ def load_chat_model(model_name: str, token: str = None) -> OpenVINOLLM:
113111 model_kwargs = {"ov_config" : ov_config , "library_name" : "transformers" }, generate_kwargs = {"do_sample" : True , "temperature" : 0.7 , "top_k" : 50 , "top_p" : 0.95 })
114112
115113
def run_test_deepeval(chat_model_name: str, personality_file_path: Path, auth_token: str, selection_num: int = 10) -> float:
    """Run a DeepEval hallucination check on a personality chatbot.

    Builds the chat engine and question set via prepare_dataset_and_model,
    generates an answer for up to ``selection_num`` questions, and scores the
    answers with compute_deepeval_hallucination.

    Args:
        chat_model_name (str): large language model path.
        personality_file_path (Path): personality YAML file path (its name keys
            into DATASET_MAPPING).
        auth_token (str): auth token used for HuggingFace.
        selection_num (int): maximum number of prompts selected to compute the
            hallucination score.

    Returns:
        float: hallucination score; the higher the score, the higher the
        possibility of a hallucination issue.
    """
    dataset_question, ov_chat_engine = prepare_dataset_and_model(chat_model_name, personality_file_path, auth_token)
    inputs = dataset_question
    # We use question as context because the dataset lacks context
    contexts = dataset_question
    contexts_res = [[context] for context in contexts]

    # Generate one chatbot response per selected prompt.
    outputs = []
    for input in tqdm(inputs[:selection_num]):
        output = ov_chat_engine.chat(input).response
        outputs.append(output)

    final_score = compute_deepeval_hallucination(inputs[:selection_num], outputs[:selection_num], contexts_res[:selection_num])
    print(f"final_score is {final_score}")
    return final_score
136139
137140
138141class OVSelfCheckLLMPrompt (SelfCheckLLMPrompt ):
# NOTE(review): `parser` (argparse.ArgumentParser) and the --chat_model argument
# are created above this span — confirm against the full file.
parser.add_argument("--personality", type=str, default="healthcare_personality.yaml", help="Path to the YAML file with chatbot personality")
parser.add_argument("--hf_token", type=str, help="HuggingFace access token to get Llama3")
parser.add_argument("--check_type", type=str, choices=["deepeval", "selfcheckgpt"], default="deepeval", help="Hallucination check type")
parser.add_argument("--selection_num", type=int, default=5, help="Maximum number of prompt are selected to compute hallucination score")

args = parser.parse_args()
# Dispatch to the selected hallucination checker; both share the same signature.
if args.check_type == "deepeval":
    run_test_deepeval(args.chat_model, Path(args.personality), args.hf_token, args.selection_num)
else:
    run_test_selfcheckgpt(args.chat_model, Path(args.personality), args.hf_token, args.selection_num)
0 commit comments