# Grouped bar chart: for every criterion, the average of the "s" (success) samples
# and of the "f" (failed) samples are drawn side by side, each with an error bar.
plt.figure(figsize=(12, 8))
bar_width = 0.1
index = np.arange(len(criteria))  # one x slot per criterion


def _draw_series(positions, averages, intervals, legend_label, colour):
    """Draw one bar series with error bars on the current axes.

    ``averages`` maps a key to the bar height; ``intervals[key][0]`` is subtracted
    from that height to get the error-bar length (presumably element 0 is the
    lower confidence bound -- confirm where the intervals are computed).
    """
    # Bars are emitted in the dict's iteration order; this is assumed to match
    # the order of ``criteria`` used for the tick labels -- verify upstream.
    half_widths = [mean - intervals[name][0] for name, mean in averages.items()]
    plt.bar(
        positions,
        list(averages.values()),
        width=bar_width,
        label=legend_label,
        color=colour,
        yerr=half_widths,
        capsize=5,
    )


_draw_series(
    index,
    average_s,
    conf_interval_s,
    f"success ({len(task['s'])} samples)",
    "darkblue",
)
# The second series is shifted right by one bar width so it sits next to the first.
_draw_series(
    index + bar_width,
    average_f,
    conf_interval_f,
    f"failed ({len(task['f'])} samples)",
    "lightblue",
)

# Axis labels and title; ``pad`` lifts the title further above the axes.
plt.xlabel("Criteria", fontsize=16)
plt.ylabel("Average Value", fontsize=16)
plt.title(
    "Average Values of 3 different baselines cases with 95% Confidence Intervals - math problems ",
    fontsize=12,
    pad=10,
)

# Put each tick midway between the two bars of its group.
plt.xticks(
    index + bar_width / 2,
    [criterion.name for criterion in criteria],
    rotation=45,
    fontsize=14,
)
plt.legend(
    loc="upper center",
    fontsize=14,
    bbox_to_anchor=(0.5, 1),
    ncol=3,
)
plt.tight_layout()  # fit the rotated tick labels inside the figure
plt.ylim(0, 5)

# Write the figure to disk, then display it.
plt.savefig("../test/test_files/agenteval-in-out/estimated_performance.png")
plt.show()