from typing import Any, Dict

# `tracer` (an OpenTelemetry tracer) and `evaluator` are assumed to have been
# initialized earlier in this guide.


def evaluate_research_with_tracing(research_output: str, context: str) -> Dict[str, Any]:
    """Evaluate research quality with integrated tracing"""
    with tracer.start_as_current_span("research_evaluation") as span:
        # Set attributes for the span
        span.set_attribute("raw.input", context)
        span.set_attribute("raw.output", research_output)
        span.set_attribute("evaluation.type", "research_quality")
        # Evaluation 1: Completeness Check
        completeness_config = {
            "eval_templates": "completeness",
            "inputs": {
                "input": context,
                "output": research_output,
            },
            "model_name": "turing_large"
        }
        completeness_result = evaluator.evaluate(
            **completeness_config,
            custom_eval_name="research_completeness",
            trace_eval=True
        )

        # Evaluation 2: Factual Accuracy
        factual_config = {
            "eval_templates": "factual_accuracy",
            "inputs": {
                "input": context,
                "output": research_output,
            },
            "model_name": "turing_large"
        }
        factual_result = evaluator.evaluate(
            **factual_config,
            custom_eval_name="research_factual_accuracy",
            trace_eval=True
        )

        # Evaluation 3: Relevance Check
        relevance_config = {
            "eval_templates": "context_relevance",
            "inputs": {
                "context": context,
                "output": research_output,
            },
            "model_name": "turing_large"
        }
        relevance_result = evaluator.evaluate(
            **relevance_config,
            custom_eval_name="research_relevance",
            trace_eval=True
        )

        # Aggregate results
        eval_results = {
            "completeness": completeness_result,
            "factual_accuracy": factual_result,
            "relevance": relevance_result,
            "overall_score": (
                completeness_result.get("score", 0) +
                factual_result.get("score", 0) +
                relevance_result.get("score", 0)
            ) / 3
        }

        # Set evaluation results as span attributes
        span.set_attribute("evaluation.overall_score", eval_results["overall_score"])

        return eval_results
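
# Illustrative usage (a sketch; `agent_notes` and `task_prompt` are placeholder
# variables produced by the research agent, not part of any API):
#
#     research_scores = evaluate_research_with_tracing(
#         research_output=agent_notes,
#         context=task_prompt,
#     )
#     research_scores["overall_score"]  # mean of the three template scores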


def evaluate_report_quality(report: str, requirements: str) -> Dict[str, Any]:
    """Evaluate final report quality"""
    with tracer.start_as_current_span("report_evaluation") as span:
        span.set_attribute("raw.input", requirements)
        span.set_attribute("raw.output", report)

        # Evaluation 1: Structure and Clarity
        clarity_config = {
            "eval_templates": "is_concise",
            "inputs": {
                "output": report,
            },
            "model_name": "turing_large"
        }
        clarity_result = evaluator.evaluate(
            **clarity_config,
            custom_eval_name="report_clarity",
            trace_eval=True
        )

        # Evaluation 2: Instruction Adherence
        instruction_config = {
            "eval_templates": "instruction_adherence",
            "inputs": {
                "input": requirements,
                "output": report,
            },
            "model_name": "turing_large"
        }
        instruction_result = evaluator.evaluate(
            **instruction_config,
            custom_eval_name="report_instruction_adherence",
            trace_eval=True
        )

        # Evaluation 3: Groundedness (no hallucinations)
        groundedness_config = {
            "eval_templates": "groundedness",
            "inputs": {
                "input": requirements,
                "output": report,
            },
            "model_name": "turing_large"
        }
        groundedness_result = evaluator.evaluate(
            **groundedness_config,
            custom_eval_name="report_groundedness",
            trace_eval=True
        )

        return {
            "clarity": clarity_result,
            "instruction_adherence": instruction_result,
            "groundedness": groundedness_result
        }
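

# A minimal end-to-end sketch showing how the two helpers could be chained.
# The strings below are placeholders, and the __main__ guard is only for
# illustration; it assumes `tracer` and `evaluator` were configured earlier
# in this guide.
if __name__ == "__main__":
    task = "Compare solid-state and lithium-ion batteries for grid storage."
    research_notes = "...research agent output..."
    final_report = "...report agent output..."

    research_scores = evaluate_research_with_tracing(research_notes, task)
    report_scores = evaluate_report_quality(final_report, requirements=task)

    print("Research overall score:", research_scores["overall_score"])
    print("Report evaluations:", report_scores)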