Use multiple sheets from an Excel file in reports (#115)

enryH · sayalaruano · web-flow · commit 698cf2e35a23 · 2025-05-20T08:36:24.000+02:00
* 🎨 table_utils module, textwrapping and naming vars

- clean-up and understanding the workflow

* 📝 where should the caption come from?

* ✨🚧 first draft to use multiple sheets from excel file in quarto reports

* 🚧 add streamlit support using selection dropdown menu

* 📝 To discuss

* Merge branch 'main' into xlsx_with_multiple_sheets

🐛 quarto report is broken atm, relative paths do not work

* 🐛 using the dataframe while qmd generation, need to redefine path

- loading sheets on report generation makes it necessary to have the path w.r.t. the folder used during report generation
- qmd notebook is moved to the quarto_report folder in the output folder, where the path is needed

To check if this works fine with output folder definitions...

* ✅ add excel table with two sheets

- seems to be not working in docx properly (sheet is not changed)

* ✅ add a xlsx example

* 🐛 Fix(quarto_reportview.py): avoid showing duplicated sheets and remove IPython prompts in static reports

---------

Co-authored-by: sayalaruano <sebasar1245@gmail.com>
diff --git a/docs/example_data/Basic_example_vuegen_demo_notebook/2_Dataframes/1_All_formats/2_abundance_table_example_xls.xls b/docs/example_data/Basic_example_vuegen_demo_notebook/2_Dataframes/1_All_formats/2_abundance_table_example_xls.xls
diff --git a/docs/example_data/Basic_example_vuegen_demo_notebook/2_Dataframes/1_All_formats/5_example_xlsx.xlsx b/docs/example_data/Basic_example_vuegen_demo_notebook/2_Dataframes/1_All_formats/5_example_xlsx.xlsx
diff --git a/src/vuegen/config_manager.py b/src/vuegen/config_manager.py
@@ -74,7 +74,7 @@ def _create_component_config_fromfile(self, file_path: Path) -> Dict[str, str]:
             file_path.resolve().as_posix()
         )  # ! needs to be posix for all OS support
         component_config["description"] = ""
-        component_config["caption"] = ""
+        component_config["caption"] = ""  # ? It is not populated here
 
         # Infer component config
         if file_ext in [
diff --git a/src/vuegen/quarto_reportview.py b/src/vuegen/quarto_reportview.py
@@ -1,13 +1,14 @@
 import os
 import subprocess
 import sys
+import textwrap
 from pathlib import Path
-from typing import List
+from typing import List, Optional
 
 import networkx as nx
-import pandas as pd
 
 from . import report as r
+from . import table_utils
 from .utils import create_folder, get_relative_file_path, is_url, sort_imports
 
 
@@ -271,10 +272,6 @@ def run_report(self, output_dir: str = BASE_DIR) -> None:
                 [self.quarto_path, "install", "tinytex", "--no-prompt"],
                 check=True,
             )
-            subprocess.run(
-                [self.quarto_path, "install", "chromium", "--no-prompt"],
-                check=True,
-            )
         try:
             subprocess.run(
                 args,
@@ -712,19 +709,16 @@ def _generate_dataframe_content(self, dataframe) -> List[str]:
 
         # Append header for DataFrame loading
         dataframe_content.append(
-            f"""```{{python}}
-#| label: '{dataframe.title} {dataframe.id}'
-#| fig-cap: ""
-"""
+            textwrap.dedent(
+                f"""\
+                ```{{python}}
+                #| label: '{dataframe.title} {dataframe.id}'
+                #| fig-cap: ""
+                """
+            )
         )
         # Mapping of file extensions to read functions
-        read_function_mapping = {
-            r.DataFrameFormat.CSV.value_with_dot: pd.read_csv,
-            r.DataFrameFormat.PARQUET.value_with_dot: pd.read_parquet,
-            r.DataFrameFormat.TXT.value_with_dot: pd.read_table,
-            r.DataFrameFormat.XLS.value_with_dot: pd.read_excel,
-            r.DataFrameFormat.XLSX.value_with_dot: pd.read_excel,
-        }
+        read_function_mapping = table_utils.read_function_mapping
         try:
             # Check if the file extension matches any DataFrameFormat value
             file_extension = Path(dataframe.file_path).suffix.lower()
@@ -740,24 +734,68 @@ def _generate_dataframe_content(self, dataframe) -> List[str]:
                 df_file_path = dataframe.file_path
             else:
                 df_file_path = get_relative_file_path(
-                    dataframe.file_path, base_path=".."
+                    dataframe.file_path,
                 )
+            sheet_names = None
+            # If the file is an Excel file, get the sheet names
+            if file_extension in [
+                r.DataFrameFormat.XLS.value_with_dot,
+                r.DataFrameFormat.XLSX.value_with_dot,
+            ]:
+                sheet_names = table_utils.get_sheet_names(df_file_path)
+                if len(sheet_names) > 1:
+                    # If there are multiple sheets, use the first one
+                    self.report.logger.info(
+                        f"Multiple sheets found in the Excel file: {df_file_path}. "
+                        f"Sheets: {sheet_names}"
+                    )
+                else:
+                    sheet_names = None
 
+            # Build the file path (URL or local file)
+            if is_url(dataframe.file_path):
+                df_file_path = dataframe.file_path
+            else:
+                df_file_path = get_relative_file_path(
+                    dataframe.file_path, base_path=".."
+                )
             # Load the DataFrame using the correct function
             read_function = read_function_mapping[file_extension]
             dataframe_content.append(
                 f"""df = pd.{read_function.__name__}('{df_file_path.as_posix()}')\n"""
             )
-
             # Display the dataframe
             dataframe_content.extend(self._show_dataframe(dataframe))
 
+            # Add further sheets
+            if sheet_names:
+                for sheet_name in sheet_names[1:]:
+                    dataframe_content.append(f"#### {sheet_name}")
+                    dataframe_content.append(
+                        textwrap.dedent(
+                            f"""\
+                    ```{{python}}
+                    #| label: '{dataframe.title} {dataframe.id} {sheet_name}'
+                    #| fig-cap: ""
+                    """
+                        )
+                    )
+                    dataframe_content.append(
+                        f"df = pd.{read_function.__name__}('{df_file_path.as_posix()}', "
+                        f"sheet_name='{sheet_name}')\n"
+                    )
+                    # Display the dataframe
+                    dataframe_content.extend(
+                        self._show_dataframe(dataframe, suffix=sheet_name)
+                    )
+
         except Exception as e:
             self.report.logger.error(
                 f"Error generating content for DataFrame: {dataframe.title}. Error: {str(e)}"
             )
             raise
         # Add caption if available
+        # ? Where should this come from?
         if dataframe.caption:
             dataframe_content.append(f">{dataframe.caption}\n")
 
@@ -787,18 +825,24 @@ def _generate_markdown_content(self, markdown) -> List[str]:
         try:
             # Initialize md code with common structure
             markdown_content.append(
-                f"""
-```{{python}}
-#| label: '{markdown.title} {markdown.id}'
-#| fig-cap: ""\n"""
+                textwrap.dedent(
+                    f"""
+                    ```{{python}}
+                    #| label: '{markdown.title} {markdown.id}'
+                    #| fig-cap: ""
+                    """
+                )
             )
             # If the file path is a URL, generate code to fetch content via requests
             if is_url(markdown.file_path):
                 markdown_content.append(
-                    f"""
-response = requests.get('{markdown.file_path}')
-response.raise_for_status()
-markdown_content = response.text\n"""
+                    textwrap.dedent(
+                        f"""\
+                    response = requests.get('{markdown.file_path}')
+                    response.raise_for_status()
+                    markdown_content = response.text
+                    """
+                    )
                 )
             else:  # If it's a local file
                 md_rel_path = get_relative_file_path(markdown.file_path, base_path="..")
@@ -826,14 +870,17 @@ def _generate_markdown_content(self, markdown) -> List[str]:
         )
         return markdown_content
 
-    def _show_dataframe(self, dataframe) -> List[str]:
+    def _show_dataframe(self, dataframe, suffix: Optional[str] = None) -> List[str]:
         """
         Appends either a static image or an interactive representation of a DataFrame to the content list.
 
         Parameters
         ----------
         dataframe : DataFrame
             The DataFrame object containing the data to display.
+        suffix : str, optional
+            A suffix to append to the DataFrame image file name like a sheet name
+            or another identifier (default is None).
 
         Returns
         -------
@@ -843,14 +890,19 @@ def _show_dataframe(self, dataframe) -> List[str]:
         dataframe_content = []
         if self.is_report_static:
             # Generate path for the DataFrame image
-            df_image = (
-                Path(self.static_dir) / f"{dataframe.title.replace(' ', '_')}.png"
-            )
+            fpath_df_image = Path(self.static_dir) / dataframe.title.replace(" ", "_")
+            if suffix:
+                fpath_df_image = fpath_df_image.with_stem(
+                    fpath_df_image.stem + f"_{suffix.replace(' ', '_')}"
+                )
+            fpath_df_image = fpath_df_image.with_suffix(".png")
+
             dataframe_content.append(
-                f"df.dfi.export('{Path(df_image).relative_to('quarto_report').as_posix()}', max_rows=10, max_cols=5, table_conversion='matplotlib')\n```\n"
+                f"df.dfi.export('{Path(fpath_df_image).relative_to('quarto_report').as_posix()}',"
+                " max_rows=10, max_cols=5, table_conversion='matplotlib')\n```\n"
             )
             # Use helper method to add centered image content
-            dataframe_content.append(self._generate_image_content(df_image))
+            dataframe_content.append(self._generate_image_content(fpath_df_image))
         else:
             # Append code to display the DataFrame interactively
             dataframe_content.append(
@@ -961,10 +1013,13 @@ def _generate_component_imports(self, component: r.Component) -> List[str]:
                     "import json",
                 ],
             },
-            "dataframe": [
+            "static_dataframe": [
                 "import pandas as pd",
-                "from itables import show, init_notebook_mode",
                 "import dataframe_image as dfi",
+            ],
+            "interactive_dataframe": [
+                "import pandas as pd",
+                "from itables import show, init_notebook_mode",
                 "init_notebook_mode(all_interactive=True)",
             ],
             "markdown": ["import IPython.display as display", "import requests"],
@@ -980,7 +1035,10 @@ def _generate_component_imports(self, component: r.Component) -> List[str]:
             if plot_type in components_imports["plot"]:
                 component_imports.extend(components_imports["plot"][plot_type])
         elif component_type == r.ComponentType.DATAFRAME:
-            component_imports.extend(components_imports["dataframe"])
+            if self.is_report_static:
+                component_imports.extend(components_imports["static_dataframe"])
+            else:
+                component_imports.extend(components_imports["interactive_dataframe"])
         elif component_type == r.ComponentType.MARKDOWN:
             component_imports.extend(components_imports["markdown"])
 
diff --git a/src/vuegen/streamlit_reportview.py b/src/vuegen/streamlit_reportview.py
@@ -5,10 +5,10 @@
 from pathlib import Path
 from typing import List
 
-import pandas as pd
 from streamlit.web import cli as stcli
 
 from . import report as r
+from . import table_utils
 from .utils import create_folder, generate_footer, get_relative_file_path, is_url
 from .utils.variables import make_valid_identifier
 
@@ -721,13 +721,7 @@ def _generate_dataframe_content(self, dataframe) -> List[str]:
         )
 
         # Mapping of file extensions to read functions
-        read_function_mapping = {
-            r.DataFrameFormat.CSV.value_with_dot: pd.read_csv,
-            r.DataFrameFormat.PARQUET.value_with_dot: pd.read_parquet,
-            r.DataFrameFormat.TXT.value_with_dot: pd.read_table,
-            r.DataFrameFormat.XLS.value_with_dot: pd.read_excel,
-            r.DataFrameFormat.XLSX.value_with_dot: pd.read_excel,
-        }
+        read_function_mapping = table_utils.read_function_mapping
 
         try:
             # Check if the file extension matches any DataFrameFormat value
@@ -738,19 +732,47 @@ def _generate_dataframe_content(self, dataframe) -> List[str]:
                 self.report.logger.error(
                     f"Unsupported file extension: {file_extension}. Supported extensions are: {', '.join(fmt.value for fmt in r.DataFrameFormat)}."
                 )
-
-            # Load the DataFrame using the correct function
-            read_function = read_function_mapping[file_extension]
+                # return []  # Skip execution if unsupported file extension
+                # Should it not return here? Can we even call the method with an unsupported file extension?
 
             # Build the file path (URL or local file)
             if is_url(dataframe.file_path):
                 df_file_path = dataframe.file_path
             else:
                 df_file_path = get_relative_file_path(dataframe.file_path)
-            dataframe_content.append(
-                f"""df = pd.{read_function.__name__}('{df_file_path.as_posix()}')\n"""
-            )
 
+            if file_extension in [
+                r.DataFrameFormat.XLS.value_with_dot,
+                r.DataFrameFormat.XLSX.value_with_dot,
+            ]:
+                dataframe_content.append("selected_sheet = 0")
+                sheet_names = table_utils.get_sheet_names(dataframe.file_path)
+                if len(sheet_names) > 1:
+                    # If there are multiple sheets, ask the user to select one
+
+                    dataframe_content.append(
+                        textwrap.dedent(
+                            f"""\
+                        sheet_names = table_utils.get_sheet_names("{dataframe.file_path}")
+                        selected_sheet = st.selectbox("Select a sheet to display", options=sheet_names)
+                        """
+                        )
+                    )
+
+            # Load the DataFrame using the correct function
+            read_function = read_function_mapping[file_extension]
+            if file_extension in [
+                r.DataFrameFormat.XLS.value_with_dot,
+                r.DataFrameFormat.XLSX.value_with_dot,
+            ]:
+                dataframe_content.append(
+                    f"""df = pd.{read_function.__name__}('{dataframe.file_path}', sheet_name=selected_sheet)\n"""
+                )
+            else:
+                dataframe_content.append(
+                    f"""df = pd.{read_function.__name__}('{df_file_path.as_posix()}')\n"""
+                )
+            # ! Alternative to select box: iterate over sheets in DataFrame
             # Displays a DataFrame using AgGrid with configurable options.
             dataframe_content.append(
                 """
@@ -1169,6 +1191,7 @@ def _generate_component_imports(self, component: r.Component) -> List[str]:
             "dataframe": [
                 "import pandas as pd",
                 "from st_aggrid import AgGrid, GridOptionsBuilder",
+                "from vuegen import table_utils",
             ],
             "markdown": ["import requests"],
             "chatbot": ["import time", "import json", "import requests"],
diff --git a/src/vuegen/table_utils.py b/src/vuegen/table_utils.py
@@ -0,0 +1,32 @@
+import pandas as pd
+
+from . import report as r
+
+# Mapping of file extensions to read functions
+read_function_mapping = {
+    r.DataFrameFormat.CSV.value_with_dot: pd.read_csv,
+    r.DataFrameFormat.PARQUET.value_with_dot: pd.read_parquet,
+    r.DataFrameFormat.TXT.value_with_dot: pd.read_table,
+    r.DataFrameFormat.XLS.value_with_dot: pd.read_excel,
+    r.DataFrameFormat.XLSX.value_with_dot: pd.read_excel,
+}
+
+
+def get_sheet_names(
+    file_path: str,
+) -> list[str]:
+    """Get the sheet names of an Excel file.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the Excel file.
+
+    Returns
+    -------
+    list[str]
+        List of sheet names.
+    """
+    # Close the workbook handle once the names are read (avoids a leaked file)
+    with pd.ExcelFile(file_path) as xls:
+        return xls.sheet_names