From bfd479a50bb31b2c1827098087dc81afe13ba6a3 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 19 Feb 2024 17:13:05 +0100 Subject: feat(benchmark): Add reports/format.py script to convert report.json to markdown --- benchmark/reports/format.py | 136 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 benchmark/reports/format.py (limited to 'benchmark') diff --git a/benchmark/reports/format.py b/benchmark/reports/format.py new file mode 100644 index 000000000..8a8cf945f --- /dev/null +++ b/benchmark/reports/format.py @@ -0,0 +1,136 @@ +import click + +from agbenchmark.reports.processing.report_types import Report + + +@click.command() +@click.argument("report_json_file", type=click.Path(exists=True, dir_okay=False)) +def print_markdown_report(report_json_file: str): + """ + Generates a Markdown report from a given report.json file. + + :param report_json_file: Path to the report.json file. + :return: A string containing the Markdown formatted report. + """ + report = Report.parse_file(report_json_file) + + # Header and metadata + click.echo(f"# Benchmark Report") + click.echo(f"- ⌛ **Run time:** `{report.metrics.run_time}`") + click.echo( + f" - **Started at:** `{report.benchmark_start_time[:16].replace('T', '` `')}`" + ) + if report.completion_time: + click.echo( + f" - **Completed at:** `{report.completion_time[:16].replace('T', '` `')}`" + ) + if report.metrics.total_cost: + click.echo(f"- 💸 **Total cost:** `${round(report.metrics.total_cost, 2)}`") + click.echo( + f"- 🏅 **Highest achieved difficulty:** `{report.metrics.highest_difficulty}`" + ) + click.echo(f"- ⚙️ **Command:** `{report.command}`") + + click.echo() # spacing + + # Aggregate information + successful, failed, unreliable = [], [], [] + for test in report.tests.values(): + test.metrics.success_percentage = ( + rsp + if (rsp := test.metrics.success_percentage) is not None + else sum(float(r.success or 0) for r in test.results) + * 100 + / len(test.results) + ) + if test.metrics.success_percentage == 100.0: + successful.append(test) + elif test.metrics.success_percentage == 0.0: + failed.append(test) + else: + unreliable.append(test) + + # Summary + click.echo("## Summary") + click.echo(f"- **`{len(successful)}` passed** {'✅'*len(successful)}") + click.echo(f"- **`{len(failed)}` failed** {'❌'*len(failed)}") + click.echo(f"- **`{len(unreliable)}` unreliable** {'⚠️'*len(unreliable)}") + + click.echo() # spacing + + # Test results + click.echo("## Challenges") + for test_name, test in report.tests.items(): + click.echo() # spacing + + result_indicator = ( + "✅" + if test.metrics.success_percentage == 100.0 + else "⚠️" + if test.metrics.success_percentage > 0 + else "❌" + ) + click.echo( + f"### {test_name} {result_indicator if test.metrics.attempted else '❔'}" + ) + click.echo(f"{test.description}") + + click.echo() # spacing + + click.echo(f"- **Attempted:** {'Yes 👍' if test.metrics.attempted else 'No 👎'}") + click.echo( + f"- **Success rate:** {round(test.metrics.success_percentage)}% " + f"({len([r for r in test.results if r.success])}/{len(test.results)})" + ) + click.echo(f"- **Difficulty:** `{test.difficulty}`") + click.echo(f"- **Categories:** `{'`, `'.join(test.category)}`") + click.echo( + f"
\nTask (click to expand)\n\n" + f"{indent('> ', test.task)}\n\n" + f"Reference answer:\n{indent('> ', test.answer)}\n" + "
" + ) + + click.echo() # spacing + + click.echo("\n#### Attempts") + for i, attempt in enumerate(test.results, 1): + click.echo( + f"\n{i}. **{'✅ Passed' if attempt.success else '❌ Failed'}** " + f"in **{attempt.run_time}** " + f"and **{quantify('step', attempt.n_steps)}**\n" + ) + click.echo(f" - **Cost:** `${round(attempt.cost, 3)}`") + if attempt.fail_reason: + click.echo( + " - **Failure reason:**\n" + + indent(" > ", attempt.fail_reason) + + "\n" + ) + if attempt.steps: + click.echo( + indent( + 3 * " ", + "
\nSteps\n", + ) + ) + for j, step in enumerate(attempt.steps, 1): + click.echo() + click.echo( + indent(3 * " ", f"{j}. {indent(3*' ', step.output, False)}") + ) + click.echo("\n
") + + +def indent(indent: str, text: str, prefix_indent: bool = True) -> str: + return (indent if prefix_indent else "") + text.replace("\n", "\n" + indent) + + +def quantify(noun: str, count: int, plural_suffix: str = "s") -> str: + if count == 1: + return f"{count} {noun}" + return f"{count} {noun}{plural_suffix}" + + +if __name__ == "__main__": + print_markdown_report() -- cgit v1.2.3