Feat/dashboard (Marker-Inc-Korea#432)
* 🔧 chore: Add autorag dashboard command and dependencies for panel, holoviews, hvplot

* ✨ feat: Add initial dashboard setup with DuckDB data retrieval and plotting.

* add duckdb to requirements.txt

* add find_trial_dir to util.py

* add find_node_summary_files util function

* ✨ feat: Update dashboard layout and styling, add new charts and widgets

* add dict_to_markdown util function

* show trial summary & handle dynamic trial directory in cli.py

* add dict_to_markdown_table feature for metric display

* build dashboard with box plot and strip plot for each node

* add config YAML file tab

* delete unused annotation and requirements

* remove unused value and test it via the CLI command

* edit documentation and README.md for dashboard

* add try-except in node_view for malformed result summary files

---------

Co-authored-by: Bwook (Byoungwook) Kim <bwook00@naver.com>
Co-authored-by: jeffrey <vkefhdl1@gmail.com>
Co-authored-by: Jeffrey (Dongkyu) Kim <vkehfdl1@gmail.com>
4 people authored May 21, 2024
1 parent cbae901 commit 46b6031
Showing 8 changed files with 329 additions and 6 deletions.
19 changes: 14 additions & 5 deletions README.md
@@ -32,7 +32,8 @@ You can see on [YouTube](https://youtu.be/2ojK8xjyXAU?feature=shared)
- [1. Prepare your evaluation data](#1-prepare-your-evaluation-data)
- [2. Evaluate your data to various RAG modules](#2-evaluate-your-data-to-various-rag-modules)
- [3. Use a found optimal RAG pipeline](#3-use-a-found-optimal-rag-pipeline)
- [4. Share your RAG pipeline](#4-share-your-rag-pipeline)
- [4. Run Dashboard to see the result](#4-run-dashboard)
- [5. Share your RAG pipeline](#5-share-your-rag-pipeline)
- [+ Config yaml file](#-create-your-own-config-yaml-file)
- [Supporting RAG modules](#supporting-nodes--modules)
- [Roadmap](#roadmap)
@@ -162,7 +163,15 @@ You can run api server with CLI command.
autorag run_api --config_path your/path/to/pipeline.yaml --host 0.0.0.0 --port 8000
```

### 4. Share your RAG pipeline
### 4. Run Dashboard

You can run a dashboard to easily see the results.

```bash
autorag dashboard --trial_dir /your/path/to/trial_dir
```

### 5. Share your RAG pipeline

You can use your RAG pipeline from the extracted pipeline YAML file.
This extracted pipeline is great for sharing your RAG pipeline with others.
@@ -252,12 +261,12 @@ at [here](https://edai.notion.site/Supporting-metrics-867d71caefd7401c9264dd91ba
# 🛣Roadmap

- [ ] Policy Module for modular RAG pipeline
- [ ] Visualize evaluation result
- [x] Visualize evaluation result
- [ ] Visualize config yaml file
- [ ] More RAG modules support
- [x] More RAG modules support
- [x] Token usage strategy
- [ ] Multi-modal support
- [ ] More evaluation metrics
- [x] More evaluation metrics
- [ ] Answer Filtering Module
- [x] Restart optimization from previous trial

8 changes: 8 additions & 0 deletions autorag/cli.py
@@ -8,6 +8,7 @@

import click

from autorag import dashboard
from autorag.deploy import Runner
from autorag.deploy import extract_best_config as original_extract_best_config
from autorag.evaluator import Evaluator
@@ -70,6 +71,12 @@ def run_web(yaml_path: Optional[str], project_dir: Optional[str], trial_path: Op
    subprocess.run(['streamlit', 'run', web_py_path, '--', '--trial_path', trial_path])


@click.command()
@click.option('--trial_dir', type=click.Path(dir_okay=True, file_okay=False, exists=True), required=True)
def run_dashboard(trial_dir: str):
    dashboard.run(trial_dir)


@click.command()
@click.option('--trial_path', type=click.Path(), help='Path to the trial directory.')
@click.option('--output_path', type=click.Path(), help='Path to the output directory.'
@@ -94,6 +101,7 @@ def restart_evaluate(trial_path):
cli.add_command(evaluate, 'evaluate')
cli.add_command(run_api, 'run_api')
cli.add_command(run_web, 'run_web')
cli.add_command(run_dashboard, 'dashboard')
cli.add_command(extract_best_config, 'extract_best_config')
cli.add_command(restart_evaluate, 'restart_evaluate')

132 changes: 132 additions & 0 deletions autorag/dashboard.py
@@ -0,0 +1,132 @@
import ast
import logging
import os
from typing import Dict, List

import matplotlib.pyplot as plt
import pandas as pd
import panel as pn
import seaborn as sns
import yaml

from autorag.utils.util import dict_to_markdown, dict_to_markdown_table

pn.extension('terminal', 'tabulator', 'mathjax', 'ipywidgets',
             console_output='disable', sizing_mode="stretch_width")
logger = logging.getLogger("AutoRAG")


def find_node_dir(trial_dir: str) -> List[str]:
    trial_summary_df = pd.read_csv(os.path.join(trial_dir, 'summary.csv'))
    result_paths = []
    for idx, row in trial_summary_df.iterrows():
        node_line_name = row['node_line_name']
        node_type = row['node_type']
        result_paths.append(os.path.join(trial_dir, node_line_name, node_type))
    return result_paths


def get_metric_values(node_summary_df: pd.DataFrame) -> Dict:
    non_metric_column_names = ['filename', 'module_name', 'module_params', 'execution_time', 'average_output_token',
                               'is_best']
    best_row = node_summary_df.loc[node_summary_df['is_best']].drop(columns=non_metric_column_names, errors='ignore')
    assert len(best_row) == 1, "The best module must be only one."
    return best_row.iloc[0].to_dict()


def make_trial_summary_md(trial_dir):
    markdown_text = f"""# Trial Result Summary
- Trial Directory : {trial_dir}
"""
    node_dirs = find_node_dir(trial_dir)
    for node_dir in node_dirs:
        node_summary_filepath = os.path.join(node_dir, 'summary.csv')
        node_type = os.path.basename(node_dir)
        node_summary_df = pd.read_csv(node_summary_filepath)
        best_row = node_summary_df.loc[node_summary_df['is_best']].iloc[0]
        metric_dict = get_metric_values(node_summary_df)
        markdown_text += f"""---
## {node_type} best module
### Module Name
{best_row['module_name']}
### Module Params
{dict_to_markdown(ast.literal_eval(best_row['module_params']), level=3)}
### Metric Values
{dict_to_markdown_table(metric_dict, key_column_name='metric_name', value_column_name='metric_value')}
"""

    return markdown_text


def node_view(node_dir: str):
    non_metric_column_names = ['filename', 'module_name', 'module_params', 'execution_time', 'average_output_token',
                               'is_best']
    summary_df = pd.read_csv(os.path.join(node_dir, 'summary.csv'))
    df_widget = pn.widgets.Tabulator(summary_df, name='Summary DataFrame')
    # TODO: add on click listener for pop-up of each file detail.
    # https://panel.holoviz.org/reference/widgets/Tabulator.html

    try:
        fig, ax = plt.subplots(figsize=(10, 5))
        metric_df = summary_df.drop(columns=non_metric_column_names, errors='ignore')
        sns.stripplot(data=metric_df, ax=ax)
        strip_plot_pane = pn.pane.Matplotlib(fig, tight=True)

        fig2, ax2 = plt.subplots(figsize=(10, 5))
        sns.boxplot(data=metric_df, ax=ax2)
        box_plot_pane = pn.pane.Matplotlib(fig2, tight=True)
        plot_pane = pn.Row(strip_plot_pane, box_plot_pane)

        layout = pn.Column("## Summary distribution plot", plot_pane, "## Summary DataFrame", df_widget)
    except Exception as e:
        logger.error(f'Skipping make boxplot and stripplot with error {e}')
        layout = pn.Column("## Summary DataFrame", df_widget)
    layout.servable()
    return layout


CSS = """
div.card-margin:nth-child(1) {
  max-height: 300px;
}
div.card-margin:nth-child(2) {
  max-height: 400px;
}
"""


def yaml_to_markdown(yaml_filepath):
    markdown_content = ""
    with open(yaml_filepath, 'r', encoding='utf-8') as file:
        try:
            content = yaml.safe_load(file)
            markdown_content += f"## {os.path.basename(yaml_filepath)}\n```yaml\n{yaml.dump(content, allow_unicode=True)}\n```\n\n"
        except yaml.YAMLError as exc:
            print(f"Error in {yaml_filepath}: {exc}")
    return markdown_content


def run(trial_dir: str):
    trial_summary_md = make_trial_summary_md(trial_dir=trial_dir)
    trial_summary_tab = pn.pane.Markdown(trial_summary_md, sizing_mode='stretch_width')

    node_views = [(str(os.path.basename(node_dir)), node_view(node_dir)) for node_dir in find_node_dir(trial_dir)]

    yaml_file_markdown = yaml_to_markdown(os.path.join(trial_dir, "config.yaml"))

    yaml_file = pn.pane.Markdown(yaml_file_markdown, sizing_mode='stretch_width')

    tabs = pn.Tabs(('Summary', trial_summary_tab), *node_views, ('Used YAML file', yaml_file), dynamic=True)

    template = pn.template.FastListTemplate(site="AutoRAG", title="Dashboard",
                                            main=[tabs], raw_css=[CSS]).servable()
    template.show()
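
For reference, the dashboard module added above can also be launched from Python rather than through the `autorag dashboard` CLI command. A minimal sketch, assuming a completed trial directory; the path below is only illustrative:

```python
# Minimal sketch: launch the new dashboard programmatically instead of via the CLI.
# The trial directory path is illustrative; it must contain the summary.csv files
# produced by a finished AutoRAG evaluation trial.
from autorag import dashboard

dashboard.run('/your/path/to/trial_dir')  # builds the Panel template and template.show() opens it in a browser
```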
65 changes: 65 additions & 0 deletions autorag/utils/util.py
@@ -2,6 +2,7 @@
import asyncio
import datetime
import functools
import glob
import itertools
import logging
import os
@@ -363,5 +364,69 @@ def split_dataframe(df, chunk_size):
    return list(map(lambda x: df[x * chunk_size:(x + 1) * chunk_size], range(num_chunks)))


def find_trial_dir(project_dir: str) -> List[str]:
    # Pattern to match directories named with numbers
    pattern = os.path.join(project_dir, '[0-9]*')
    all_entries = glob.glob(pattern)

    # Filter out only directories
    trial_dirs = [entry for entry in all_entries if os.path.isdir(entry) and entry.split(os.sep)[-1].isdigit()]

    return trial_dirs


def find_node_summary_files(trial_dir: str) -> List[str]:
    # Find all summary.csv files recursively
    all_summary_files = glob.glob(os.path.join(trial_dir, '**', 'summary.csv'), recursive=True)

    # Keep only node-level summary files (drop the trial-level and node-line-level summary.csv)
    filtered_files = [f for f in all_summary_files if f.count(os.sep) > trial_dir.count(os.sep) + 2]

    return filtered_files


def normalize_unicode(text: str) -> str:
    return unicodedata.normalize('NFC', text)


def dict_to_markdown(d, level=1):
    """
    Convert a dictionary to a Markdown formatted string.
    :param d: Dictionary to convert
    :param level: Current level of heading (used for nested dictionaries)
    :return: Markdown formatted string
    """
    markdown = ""
    for key, value in d.items():
        if isinstance(value, dict):
            markdown += f"{'#' * level} {key}\n"
            markdown += dict_to_markdown(value, level + 1)
        elif isinstance(value, list):
            markdown += f"{'#' * level} {key}\n"
            for item in value:
                if isinstance(item, dict):
                    markdown += dict_to_markdown(item, level + 1)
                else:
                    markdown += f"- {item}\n"
        else:
            markdown += f"{'#' * level} {key}\n{value}\n"
    return markdown


def dict_to_markdown_table(data, key_column_name: str, value_column_name: str):
    # Check if the input is a dictionary
    if not isinstance(data, dict):
        raise ValueError("Input must be a dictionary")

    # Create the header of the table
    header = f"| {key_column_name} | {value_column_name} |\n| :---: | :-----: |\n"

    # Create the rows of the table
    rows = ""
    for key, value in data.items():
        rows += f"| {key} | {value} |\n"

    # Combine header and rows
    markdown_table = header + rows
    return markdown_table
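
The two markdown helpers added above are what the dashboard uses to render module parameters and metric values. A small sketch of their output; the metric names and values below are made up for illustration:

```python
# Illustrative sketch of the new markdown helpers; the dictionaries are invented examples.
from autorag.utils.util import dict_to_markdown, dict_to_markdown_table

metrics = {'retrieval_f1': 0.35, 'retrieval_recall': 1.0}
print(dict_to_markdown_table(metrics, key_column_name='metric_name',
                             value_column_name='metric_value'))
# | metric_name | metric_value |
# | :---: | :-----: |
# | retrieval_f1 | 0.35 |
# | retrieval_recall | 1.0 |

module_params = {'top_k': 10, 'embedding_model': 'openai'}
print(dict_to_markdown(module_params, level=3))
# ### top_k
# 10
# ### embedding_model
# openai
```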
9 changes: 9 additions & 0 deletions docs/source/tutorial.md
@@ -117,6 +117,15 @@ start_trial function will be executed instead of restart_trial.
Note that a new trial folder will be created, not a new restart result in that Trial Path.
```

## Run Dashboard to see your trial result!

As of AutoRAG version 0.2.0, you can use the dashboard feature to easily see the results of AutoRAG.
You can run the dashboard by simply running the command below.

```bash
autorag dashboard --trial_dir /your/path/to/trial_dir
```

## Extract pipeline and evaluate test dataset

Now, it's time to evaluate the test dataset with the found RAG pipeline. For this, you can extract the optimal pipeline and
7 changes: 7 additions & 0 deletions requirements.txt
@@ -49,3 +49,10 @@ streamlit

### Langchain ###
langchain-core>=0.1.6

# autorag dashboard
panel
seaborn
ipykernel
ipywidgets
ipywidgets_bokeh
32 changes: 32 additions & 0 deletions tests/autorag/test_dashboard.py
@@ -0,0 +1,32 @@
import os
import pathlib

import pandas as pd
import pytest

from autorag.dashboard import get_metric_values, make_trial_summary_md

root_dir = pathlib.PurePath(os.path.dirname(os.path.realpath(__file__))).parent
sample_project_dir = os.path.join(root_dir, 'resources', 'result_project')
sample_trial_dir = os.path.join(sample_project_dir, '0')


@pytest.fixture
def retrieval_summary_df():
    return pd.read_csv(os.path.join(sample_trial_dir, 'retrieve_node_line', 'retrieval', 'summary.csv'))


def test_get_metric_values(retrieval_summary_df):
    result_dict = get_metric_values(retrieval_summary_df)
    assert len(result_dict.keys()) == 3
    assert set(list(result_dict.keys())) == {'retrieval_f1', 'retrieval_recall', 'retrieval_precision'}
    assert result_dict['retrieval_recall'] == 1.0
    assert result_dict['retrieval_precision'] == 0.1


def test_make_trial_summary_md():
    md_text = make_trial_summary_md(sample_trial_dir)
    assert bool(md_text)

# def test_dashboard_run():
#     dashboard.run(sample_trial_dir)
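
A note on running the new test module locally. This sketch assumes you invoke it from the repository root and that the `tests/resources/result_project` fixture referenced by the paths above is present:

```python
# Hypothetical invocation of the new tests via pytest's Python API;
# running `pytest tests/autorag/test_dashboard.py -v` from the repo root is equivalent.
import pytest

pytest.main(['tests/autorag/test_dashboard.py', '-v'])
```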