Advanced features: group_by

This example demonstrates advanced table formatting features in rtflite, focusing on the group_by functionality that provides enhanced readability by suppressing duplicate values within groups.

Overview

The group_by feature is particularly useful for clinical trial listings where multiple rows belong to the same subject or treatment group. Instead of repeating identical values in every row, group_by displays the value only once per group, leaving subsequent rows blank for better visual organization.

Key benefits

- Improved readability: Reduces visual clutter by eliminating redundant information
- Clinical standards compliance: Follows pharmaceutical industry conventions for listing formats
- Hierarchical grouping: Supports multiple columns with nested group relationships

Imports

from importlib.resources import files

import polars as pl

import rtflite as rtf

Load and prepare adverse events data

Load the adverse events dataset and create a subset for demonstration:

# Resolve the adverse events parquet file inside the rtflite.data package
# and read it into a polars DataFrame
data_path = files("rtflite.data").joinpath("adae.parquet")
df = pl.read_parquet(source=data_path)

# Keep a window of up to 60 rows starting at row offset 200
# (zero-based offsets 200 through 259) for this example
ae_subset = df.slice(offset=200, length=60)

Create additional columns for a more comprehensive listing format:

# Build the listing frame in one chain:
#   1. derive the display columns (named expressions set the column names),
#   2. keep only the columns the examples below use,
#   3. sort by the key variables so related events sit next to each other.
ae_t1 = (
    ae_subset.with_columns(
        # Subline header text: study and site information
        SUBLINEBY=(
            pl.lit("Trial Number: ")
            + pl.col("STUDYID")
            + pl.lit(", Site Number: ")
            + pl.col("SITEID").cast(pl.String)
        ),
        # Subject line text: demographic information plus treatment
        SUBJLINE=(
            pl.lit("Subject ID = ")
            + pl.col("USUBJID")
            + pl.lit(", Gender = ")
            + pl.col("SEX")
            + pl.lit(", Race = ")
            + pl.col("RACE")
            + pl.lit(", AGE = ")
            + pl.col("AGE").cast(pl.String)
            + pl.lit(" Years")
            + pl.lit(", TRT = ")
            + pl.col("TRTA")
        ),
        # Adverse event term in title case
        AEDECD1=pl.col("AEDECOD").str.to_titlecase(),
        # Duration string: numeric duration, a space, then the unit
        DUR=pl.col("ADURN").cast(pl.String) + pl.lit(" ") + pl.col("ADURU"),
    )
    .select(
        "SUBLINEBY",
        "TRTA",
        "SUBJLINE",
        "USUBJID",
        "ASTDY",
        "AEDECD1",
        "DUR",
        "AESEV",
        "AESER",
        "AEREL",
        "AEACN",
        "AEOUT",
    )
    .sort("SUBLINEBY", "TRTA", "SUBJLINE", "USUBJID", "ASTDY")
)

Demonstrate single column group_by

Start with a simple example using a single column for grouping:

# Example 1 input: four listing columns, first 15 rows, ordered so that
# identical subject / adverse-event values are adjacent
single_df = (
    ae_t1.select("USUBJID", "AEDECD1", "AESEV", "AESER")
    .head(15)
    .sort("USUBJID", "AEDECD1")
)

single_title = rtf.RTFTitle(
    text=["Adverse Events Listing", "Example 1: Single Column group_by"],
    text_convert=False,
)

# Bold column headers, one per column of single_df
single_header = rtf.RTFColumnHeader(
    text=["Subject ID", "Adverse Event", "Severity", "Serious"],
    text_format="b",
    text_justification=["l", "l", "c", "c"],
)

# NOTE(review): the title says "Single Column", but group_by lists two
# columns (subject ID and adverse event), matching the footnote below.
# Confirm which is intended.
single_body = rtf.RTFBody(
    group_by=["USUBJID", "AEDECD1"],
    col_rel_width=[3, 4, 2, 2],
    text_justification=["l", "l", "c", "c"],
)

single_footnote = rtf.RTFFootnote(
    text="Note: Subject ID and Adverse Event values are shown only once per group for better readability",
    text_convert=False,
)

# Assemble the RTF document from the pieces above
doc_single = rtf.RTFDocument(
    df=single_df,
    rtf_title=single_title,
    rtf_column_header=single_header,
    rtf_body=single_body,
    rtf_footnote=single_footnote,
)

# Write the RTF file
doc_single.write_rtf("advanced-group-by-single.rtf")

Multi-page example with group context

Demonstrate how group_by works with pagination, including context restoration:

# Dataset for the multi-page demonstration.
# ae_t1 derives from a slice of at most 60 rows, so head(100) keeps every
# row; pagination is driven by the nrow=25 page setting below.
ae_large = ae_t1.head(100)

multipage_df = ae_large.select(
    "USUBJID", "ASTDY", "AEDECD1", "AESEV", "AESER"
).sort("USUBJID", "ASTDY")

# 25 rows per page, intended to force the listing onto several pages
multipage_page = rtf.RTFPage(nrow=25)

multipage_title = rtf.RTFTitle(
    text=["Adverse Events Listing", "Example 3: Multi-page with group_by"],
    text_convert=False,
)

multipage_header = rtf.RTFColumnHeader(
    text=["Subject ID", "Study Day", "Adverse Event", "Severity", "Serious"],
    text_format="b",
    text_justification=["l", "c", "l", "c", "c"],
)

# Group on subject ID and study day
multipage_body = rtf.RTFBody(
    group_by=["USUBJID", "ASTDY"],
    col_rel_width=[3, 1, 4, 2, 2],
    text_justification=["l", "c", "l", "c", "c"],
)

multipage_footnote = rtf.RTFFootnote(
    text=[
        "Note: In multi-page listings, group context is automatically restored",
        "at the beginning of each new page for better readability.",
    ],
    text_convert=False,
)

doc_multipage = rtf.RTFDocument(
    df=multipage_df,
    rtf_page=multipage_page,
    rtf_title=multipage_title,
    rtf_column_header=multipage_header,
    rtf_body=multipage_body,
    rtf_footnote=multipage_footnote,
)

# Write the RTF file
doc_multipage.write_rtf("advanced-group-by-multipage.rtf")

Combining group_by with new_page (treatment separation)

Demonstrate the powerful combination of group_by and new_page for clinical trial reporting:

# Input for the treatment-separated listing: keep two treatment arms, pick
# the listing columns, take the first 40 rows, then order by treatment,
# subject and study day.
ae_with_treatments = (
    ae_t1.filter(pl.col("TRTA").is_in(["Placebo", "Xanomeline High Dose"]))
    .select("TRTA", "USUBJID", "ASTDY", "AEDECD1", "AESEV")
    .head(40)
    .sort("TRTA", "USUBJID", "ASTDY")
)

treatment_title = rtf.RTFTitle(
    text=[
        "Adverse Events Listing",
        "Example 5: group_by + new_page (Treatment Separation)",
    ],
    text_convert=False,
)

# Four headers: the data frame has five columns, one of which (TRTA) is
# used as the page_by variable in the body below
treatment_header = rtf.RTFColumnHeader(
    text=["Subject ID", "Study Day", "Adverse Event", "Severity"],
    col_rel_width=[3, 1, 4, 2],
    text_format="b",
    text_justification=["l", "c", "l", "c"],
)

treatment_body = rtf.RTFBody(
    # Separate pages by treatment and force a new page for each one
    page_by=["TRTA"],
    new_page=True,
    # Display treatment as a spanning row and remove the column
    pageby_row="first_row",
    # Suppress duplicate subject / study-day values within each treatment page
    group_by=["USUBJID", "ASTDY"],
    col_rel_width=[2, 3, 1, 4, 2],
    border_top=["single", "", "", "", ""],
    border_bottom=["single", "", "", "", ""],
    text_justification=["l", "l", "c", "l", "c"],
    # Repeat headers on each treatment page
    pageby_header=True,
)

treatment_footnote = rtf.RTFFootnote(
    text=[
        "Example of group_by + new_page combination:",
        "- Each treatment group gets its own page(s) (new_page=True)",
        "- Within each treatment, USUBJID and ASTDY are suppressed when duplicate (group_by)",
        "- Headers are repeated on each treatment page (pageby_header=True)",
    ],
    text_convert=False,
)

doc_treatment_separated = rtf.RTFDocument(
    df=ae_with_treatments,
    rtf_title=treatment_title,
    rtf_column_header=treatment_header,
    rtf_body=treatment_body,
    rtf_footnote=treatment_footnote,
)

# Write the RTF file
doc_treatment_separated.write_rtf("advanced-group-by-group-newpage.rtf")

Demonstrating subline_by with subheader generation

The subline_by feature creates visually distinct subheader rows that group related data, making listings easier to read and follow:

# Create data with a clear grouping structure for the subline demonstration:
# two treatment arms, first 30 rows, ordered by subline text, treatment and
# subject.
ae_subline_data = (
    ae_t1.filter(pl.col("TRTA").is_in(["Placebo", "Xanomeline High Dose"]))
    .head(30)
    .sort(["SUBLINEBY", "TRTA", "USUBJID"])
)

# Create RTF document with subline_by to generate subheaders
doc_subline = rtf.RTFDocument(
    df=ae_subline_data.select(["SUBLINEBY", "USUBJID", "AEDECD1", "AESEV", "AESER"]),
    rtf_title=rtf.RTFTitle(
        text=[
            "Adverse Events Listing",
            "Example 6: subline_by with Subheader Generation",
        ],
        text_convert=False,
    ),
    rtf_column_header=rtf.RTFColumnHeader(
        text=[
            "Subject ID",
            "Adverse Event",
            "Severity",
            "Serious",
        ],  # Headers for remaining columns after SUBLINEBY removal
        # Widths follow the header order above. Adverse Event is the widest
        # column (4) and Severity is narrow (2), the same layout Example 1
        # uses for these four columns.
        col_rel_width=[3, 4, 2, 2],
        text_format="b",
        text_justification=["l", "l", "c", "c"],
    ),
    rtf_body=rtf.RTFBody(
        subline_by=["SUBLINEBY"],  # Creates subheader rows from SUBLINEBY values
        # Same widths as the column header so header and body cells line up
        col_rel_width=[3, 4, 2, 2],
        text_justification=["l", "l", "l", "c", "c"],
    ),
    rtf_footnote=rtf.RTFFootnote(
        text=[
            "Note: subline_by creates subheader rows that span all columns",
            "- SUBLINEBY column values become bold subheader text",
            "- Original SUBLINEBY column is removed from table data",
            "- Subheaders provide clear visual grouping of related records",
        ],
        text_convert=False,
    ),
)

# Generate the RTF file
doc_subline.write_rtf("advanced-group-by-subline.rtf")

Advanced combination - subline_by with group_by

Demonstrate the powerful combination of subline_by and group_by for comprehensive clinical listings:

# Row position modulo 3, used to assign a synthetic visit label per row
visit_slot = pl.int_range(pl.len()) % 3

# Data for the combined demonstration: first 40 rows, plus a VISIT column
# cycling through "Visit 1" / "Visit 2" / "Visit 3" by row position, then
# ordered by subline text, subject and visit.
ae_comprehensive = (
    ae_t1.head(40)
    .with_columns(
        VISIT=pl.when(visit_slot == 0)
        .then(pl.lit("Visit 1"))
        .when(visit_slot == 1)
        .then(pl.lit("Visit 2"))
        .otherwise(pl.lit("Visit 3"))
    )
    .sort("SUBLINEBY", "USUBJID", "VISIT")
)

comprehensive_df = ae_comprehensive.select(
    "SUBLINEBY", "USUBJID", "VISIT", "AEDECD1", "AESEV"
)

comprehensive_title = rtf.RTFTitle(
    text=[
        "Adverse Events Listing",
        "Example 7: subline_by + group_by Comprehensive",
    ],
    text_convert=False,
)

# Headers for the columns that remain after SUBLINEBY removal
comprehensive_header = rtf.RTFColumnHeader(
    text=["Subject ID", "Visit", "Adverse Event", "Severity"],
    col_rel_width=[3, 2, 4, 2],
    text_format="b",
    text_justification=["l", "c", "l", "c"],
)

comprehensive_body = rtf.RTFBody(
    # Trial/site subheaders come from SUBLINEBY
    subline_by=["SUBLINEBY"],
    # Duplicate subject IDs are suppressed
    group_by=["USUBJID"],
    col_rel_width=[3, 2, 4, 2],
    text_justification=["l", "l", "c", "l", "c"],
)

comprehensive_footnote = rtf.RTFFootnote(
    text=[
        "Advanced example combining subline_by and group_by:",
        "- SUBLINEBY creates bold subheader rows for trial/site information",
        "- group_by suppresses duplicate USUBJID values within each group",
        "- Result: Clear visual hierarchy with minimal redundancy",
    ],
    text_convert=False,
)

doc_comprehensive = rtf.RTFDocument(
    df=comprehensive_df,
    rtf_title=comprehensive_title,
    rtf_column_header=comprehensive_header,
    rtf_body=comprehensive_body,
    rtf_footnote=comprehensive_footnote,
)

# Write the RTF file
doc_comprehensive.write_rtf("advanced-group-by-comprehensive.rtf")

page_by with divider row filtering

A common scenario in clinical reporting is data that includes divider rows marked with "-----" to visually separate sections. The page_by feature automatically filters these divider rows to create clean output while preserving all associated data.

Example: Data with divider rows

# Example rows as (section, item, value); the first row carries "-----" in
# the section column, marking it as a divider row
divider_rows = [
    ("-----", "Participant in Population", 55),
    ("Age", "    <60", 25),
    ("Age", "    >=60", 30),
]

# Note: this rebinds `df`, which previously held the adverse events data
df = pl.DataFrame(divider_rows, schema=["section", "item", "value"], orient="row")

# Echo the frame (notebook-style display)
df

When using page_by on data containing "-----" divider rows, rtflite automatically filters out the divider rows and keeps the data rows associated with them:

# Body settings for the divider example: rows are paged on the "section"
# column, with a single top and bottom border on the first column.
# NOTE(review): col_rel_width has two entries while the justification and
# border lists have three; presumably the page_by column is not rendered as
# a table column. Confirm against the rtflite documentation.
divider_body = rtf.RTFBody(
    page_by="section",
    col_rel_width=[1, 1],
    text_justification=["l", "l", "c"],
    border_top=["single", "", ""],
    border_bottom=["single", "", ""],
)

doc_divider = rtf.RTFDocument(df=df, rtf_body=divider_body)

# Write the RTF file
doc_divider.write_rtf("advanced-group-by-divider-filtering.rtf")