from fi.evals import ContextRelevance, ContextRetrieval, Groundedness, Evaluator
from fi.testcases import TestCase
import pandas as pd
import time

# Shared evaluator client used by all helpers below. This assumes Future AGI
# credentials are already configured (e.g. via FI_API_KEY / FI_SECRET_KEY
# environment variables); adjust the constructor arguments to match your setup.
evaluator = Evaluator()

def evaluate_context_relevance(df, question_col, context_col, model="gpt-4o-mini"):
    """
    Evaluate context relevance for each row in the dataframe
    """
    agentic_context_eval = ContextRelevance(config={"model": model, "check_internet": True})
    results = []
    for _, row in df.iterrows():
        try:
            # Pair the user question with the context that was retrieved for it
            test_case = TestCase(
                input=row[question_col],
                context=row[context_col]
            )
            result = evaluator.evaluate(eval_templates=[agentic_context_eval], inputs=[test_case], model_name="turing_flash")
            time.sleep(2)  # Brief pause between calls to stay under rate limits
            results.append({'context_relevance': result.eval_results[0].metrics[0].value})
        except Exception as e:
            print(f"Error in context relevance evaluation: {e}")
            results.append({'context_relevance': 'Error'})
    return pd.DataFrame(results)

def evaluate_context_retrieval(df, question_col, context_col, response_col, model="gpt-4o-mini"):
    """
    Evaluate context retrieval for each row in the dataframe
    """
    agentic_retrieval_eval = ContextRetrieval(config={
        "model": model,
        "check_internet": True,
        "criteria": "Check whether the retrieved context is relevant and accurate for the query, and whether the generated response is correct"
    })
    results = []
    for _, row in df.iterrows():
        try:
            # Include the generated answer so retrieval quality is judged against it
            test_case = TestCase(
                input=row[question_col],
                context=row[context_col],
                output=row[response_col]
            )
            result = evaluator.evaluate(eval_templates=[agentic_retrieval_eval], inputs=[test_case], model_name="turing_flash")
            time.sleep(2)  # Brief pause between calls to stay under rate limits
            results.append({'context_retrieval': result.eval_results[0].metrics[0].value})
        except Exception as e:
            print(f"Error in context retrieval evaluation: {e}")
            results.append({'context_retrieval': 'Error'})
    return pd.DataFrame(results)

def evaluate_groundedness(df, question_col, context_col, response_col, model="gpt-4o-mini"):
    """
    Evaluate groundedness for each row in the dataframe
    """
    agentic_groundedness_eval = Groundedness(config={"model": model, "check_internet": True})
    results = []
    for _, row in df.iterrows():
        try:
            # Check whether the generated response is supported by the retrieved context
            test_case = TestCase(
                input=row[question_col],
                context=row[context_col],
                response=row[response_col]
            )
            result = evaluator.evaluate(eval_templates=[agentic_groundedness_eval], inputs=[test_case], model_name="turing_flash")
            time.sleep(2)  # Brief pause between calls to stay under rate limits
            results.append({'groundedness': result.eval_results[0].metrics[0].value})
        except Exception as e:
            print(f"Error in groundedness evaluation: {e}")
            results.append({'groundedness': 'Error'})
    return pd.DataFrame(results)

def run_all_evaluations(df, question_col, context_col, response_col, model="gpt-4o-mini"):
    """
    Run all three evaluations and combine results
    """
    relevance_results = evaluate_context_relevance(df, question_col, context_col, model)
    retrieval_results = evaluate_context_retrieval(df, question_col, context_col, response_col, model)
    groundedness_results = evaluate_groundedness(df, question_col, context_col, response_col, model)
    # Combine the scores with the original dataframe column-wise; reset the index
    # so rows stay aligned even when df does not use a default RangeIndex
    return pd.concat(
        [df.reset_index(drop=True), relevance_results, retrieval_results, groundedness_results],
        axis=1
    )
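
# --- Example usage (illustrative sketch, not part of the original script) ---
# The dataframe below is hypothetical: it assumes your RAG logs contain one
# question, its retrieved context, and the generated answer per row. The
# column names are placeholders; swap in whatever your pipeline produces.
if __name__ == "__main__":
    sample_df = pd.DataFrame({
        "question": ["What is the capital of France?"],
        "retrieved_context": ["Paris is the capital and largest city of France."],
        "generated_answer": ["The capital of France is Paris."],
    })
    scored_df = run_all_evaluations(
        sample_df,
        question_col="question",
        context_col="retrieved_context",
        response_col="generated_answer",
    )
    print(scored_df[["context_relevance", "context_retrieval", "groundedness"]])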