# docs/remove-grid-columns.py

#!/usr/bin/env python3
"""
Script to remove columns from reStructuredText grid tables based on column names.
Only processes grid tables that are declared using the .. table:: directive.
Column names to remove are specified using field lists in the RST file itself.

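Usage:
  python remove-grid-columns.py input.rst -o output.rst
  python remove-grid-columns.py input.rst --output output.rst
  cat input.rst | python remove-grid-columns.py - > output.rst

Use "-" as the input file to read from stdin.
If no output file is specified, the result is printed to stdout.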

The RST file should contain field list entries specifying columns to remove:
  :remove-column-from-html-table: Column1, Column2, Column3

Note: Only grid tables declared with the .. table:: directive will be modified.
Standalone grid tables without the directive will be left unchanged.
"""

import argparse
import sys
import re
import os
from typing import List, Tuple, Optional


def parse_grid_table(lines: List[str], start_idx: int) -> Tuple[List[List[List[str]]], List[int], int]:
    """
    Read one reStructuredText grid table out of ``lines``.

    Scanning starts at ``start_idx`` and moves forward to the first line whose
    stripped text begins with '+'. The table is the contiguous run of lines
    that begin (after stripping) with '+' or '|'. Column boundaries are taken
    from the '+' characters of the first border line only, so cells that span
    several columns are not recognised as such.

    Returns:
        - table_data: one entry per row; each row is a list of cells and each
          cell is the list of its text lines (right-stripped, left
          indentation kept)
        - col_widths: number of characters between consecutive '+' markers
        - end_idx: index of the first line after the table. ``start_idx`` is
          returned instead when no border line was found at all.
    """
    total = len(lines)

    # Advance to the opening border of the table.
    cursor = start_idx
    while cursor < total and not lines[cursor].strip().startswith('+'):
        cursor += 1
    if cursor >= total:
        return [], [], start_idx

    # The table is the unbroken run of border ('+') and data ('|') lines.
    first = cursor
    while cursor < total and lines[cursor].strip().startswith(('+', '|')):
        cursor += 1
    block = lines[first:cursor]

    # Every '+' on the opening border marks a column boundary.
    corners = [pos for pos, ch in enumerate(block[0]) if ch == '+']
    if len(corners) < 2:
        return [], [], cursor

    # (start, stop) slice bounds of each column's text, excluding the markers.
    spans = [(left + 1, right) for left, right in zip(corners, corners[1:])]
    col_widths = [stop - start for start, stop in spans]

    table_data = []
    pending = None  # row currently being accumulated, if any
    for raw in block:
        marker = raw.strip()[:1]
        if marker == '+':
            # A border closes whatever row was being collected.
            if pending is not None:
                table_data.append(pending)
                pending = None
        elif marker == '|':
            if pending is None:
                pending = [[] for _ in spans]
            # Keep leading indentation; only trailing padding is dropped.
            for cell, (start, stop) in zip(pending, spans):
                cell.append(raw[start:stop].rstrip())

    # A table without a closing border still yields its last row.
    if pending is not None:
        table_data.append(pending)

    return table_data, col_widths, cursor


def find_column_removal_directives(lines: List[str]) -> List[str]:
    """
    Collect the column names listed in ``:remove-column-from-html-table:`` fields.

    Every line whose stripped text starts with that field name contributes its
    comma-separated values; blank entries are dropped. Names are returned in
    the order they appear, without de-duplication.
    """
    marker = ':remove-column-from-html-table:'
    collected = []

    for raw in lines:
        text = raw.strip()
        if not text.startswith(marker):
            continue
        # Everything after the field name is a comma-separated list of names.
        for piece in text[len(marker):].split(','):
            name = piece.strip()
            if name:
                collected.append(name)

    return collected


def check_docs_build_context(lines: List[str]) -> None:
    """
    Compare the file's ``:docs-build-context:`` field with DOCS_BUILD_CONTEXT.

    The first such field with a non-empty value is used. If the environment
    variable differs from it (or is unset), a message is written to stderr and
    the process exits with status 0. Files without the field are not checked.
    """
    marker = ':docs-build-context:'
    declared = None

    for raw in lines:
        text = raw.strip()
        if not text.startswith(marker):
            continue
        value = text[len(marker):].strip()
        if value:
            declared = value
            break

    if declared is None:
        return

    actual = os.environ.get('DOCS_BUILD_CONTEXT')
    if actual == declared:
        print(f"docs-build-context '{declared}' matches environment variable", file=sys.stderr)
        return

    print(f"Docs-build-context '{declared}' does not match DOCS_BUILD_CONTEXT environment variable '{actual}'. Skipping.", file=sys.stderr)
    sys.exit(0)


def should_remove_emptied_rows(lines: List[str]) -> bool:
    """
    Report whether the file asks for emptied rows to be dropped.

    Only the first ``:remove-column-emptied-row:`` field is consulted; the
    result is True exactly when its value is the string ``1``.
    """
    marker = ':remove-column-emptied-row:'
    stripped = (raw.strip() for raw in lines)
    first_hit = next((text for text in stripped if text.startswith(marker)), None)
    if first_hit is None:
        return False
    return first_hit[len(marker):].strip() == '1'


def find_first_table_position(lines: List[str]) -> int:
    """
    Locate the first grid table that find_table_directive accepts.

    A candidate is any line that starts (after stripping) with '+' and
    contains '-'. Returns the index of the first accepted candidate, or
    ``len(lines)`` when there is none.
    """
    return next(
        (
            idx
            for idx, text in enumerate(lines)
            if text.strip().startswith('+')
            and '-' in text
            and find_table_directive(lines, idx)[0]
        ),
        len(lines),
    )


def find_column_indices(headers: List[List[str]], columns_to_remove: List[str]) -> List[int]:
    """
    Map column names to the indices of the matching header cells.

    Each header cell is a list of text lines. A requested name is matched
    against the headers in three passes, stopping at the first pass that hits:

    1. exact match against the cell text (lines joined with a newline, then
       stripped);
    2. case-insensitive match against that same text;
    3. case-insensitive match after collapsing every run of whitespace
       (including line breaks) to one space, so a header wrapped over several
       lines can be named on a single line.

    When several headers match, the first one wins. Names that match nothing
    produce a warning on stderr.

    Returns:
        The distinct matching indices, sorted in descending order.
    """
    def _squash(text: str) -> str:
        # Collapse whitespace runs so wrapped header text compares equal to
        # its single-line spelling.
        return ' '.join(text.split()).lower()

    header_strings = ['\n'.join(cell).strip() for cell in headers]
    lowered = [text.lower() for text in header_strings]
    squashed = [_squash(text) for text in header_strings]

    indices = set()
    for col_name in columns_to_remove:
        passes = (
            (header_strings, col_name),
            (lowered, col_name.lower()),
            (squashed, _squash(col_name)),
        )
        for candidates, wanted in passes:
            if wanted in candidates:
                indices.add(candidates.index(wanted))
                break
        else:
            print(f"Warning: Column '{col_name}' not found in table headers", file=sys.stderr)

    return sorted(indices, reverse=True)


def remove_columns_and_empty_rows(table_data: List[List[List[str]]], col_indices: List[int], remove_empty_rows: bool = False) -> List[List[List[str]]]:
    """
    Return a copy of ``table_data`` without the columns in ``col_indices``.

    ``col_indices`` may be given in any order and may contain duplicates;
    indices beyond the end of a row are ignored for that row. Negative
    indices count from the end of each row. The input rows are not modified.

    When ``remove_empty_rows`` is true, any row whose remaining cells contain
    only whitespace is left out of the result.
    """
    new_table = []
    for row in table_data:
        # Resolve the indices against this row up front. Filtering by position
        # avoids the index shifting that successive pop() calls cause when the
        # indices are not sorted in descending order.
        width = len(row)
        drop = {idx if idx >= 0 else idx + width for idx in col_indices}
        new_row = [cell for pos, cell in enumerate(row) if pos not in drop]

        if remove_empty_rows and not any('\n'.join(cell).strip() for cell in new_row):
            continue  # nothing visible is left in this row

        new_table.append(new_row)
    return new_table


def calculate_column_widths(table_data: List[List[List[str]]]) -> List[int]:
    """
    Return, per column, the length of the longest right-stripped cell line.

    The number of columns is taken from the first row; extra cells in later
    rows are ignored. An empty table yields an empty list.
    """
    if not table_data:
        return []

    widths = [0] * len(table_data[0])

    for row in table_data:
        # zip() stops at the shorter side, so surplus cells are skipped.
        for col, cell in zip(range(len(widths)), row):
            longest = max((len(text.rstrip()) for text in cell), default=0)
            if longest > widths[col]:
                widths[col] = longest

    return widths


def generate_border_line(col_widths: List[int], indent: str = '') -> str:
    """Build a '+---+---+' style border for the given widths, prefixed with ``indent``."""
    segments = ''.join('-' * width + '+' for width in col_widths)
    return indent + '+' + segments


def generate_data_lines(row: List[List[str]], col_widths: List[int], indent: str = '') -> List[str]:
    """
    Render one table row as '|'-delimited text lines.

    The row is as tall as its tallest cell; shorter cells are filled with
    blanks. Each cell line is right-stripped and then left-justified to the
    column width. Cells without a corresponding width are not rendered.
    """
    height = max((len(cell) for cell in row), default=0)
    # Pair each cell with its width; zip() drops cells that have no width.
    columns = list(zip(row, col_widths))

    rendered = []
    for line_no in range(height):
        pieces = []
        for cell, width in columns:
            text = cell[line_no].rstrip() if line_no < len(cell) else ''
            pieces.append(text.ljust(width) + '|')
        rendered.append(indent + '|' + ''.join(pieces))

    return rendered


def rebuild_table(table_data: List[List[List[str]]], indent: str = '') -> List[str]:
    """
    Render ``table_data`` back into grid-table text lines.

    Column widths are recomputed from the cell contents. The same '-' border
    is emitted above the first row and after every row, and every line is
    prefixed with ``indent``. An empty table produces no lines.
    """
    if not table_data:
        return []

    widths = calculate_column_widths(table_data)
    border = generate_border_line(widths, indent)

    rendered = [border]
    for row in table_data:
        rendered += generate_data_lines(row, widths, indent)
        rendered.append(border)

    return rendered


def find_table_directive(lines: List[str], start_idx: int) -> Tuple[bool, int, Optional[str], str]:
    """
    Check whether the grid table at ``start_idx`` belongs to a ``.. table::`` directive.

    Looking backwards from the table, blank lines are skipped. The table is
    accepted when the nearest non-empty line is either

    * the ``.. table::`` line itself, or
    * the last line of a block of ``:option:`` lines (for example
      ``:widths:``, ``:class:``, ``:align:``) that sits directly below a
      ``.. table::`` line.

    For backward compatibility, a table directly preceded by a
    ``:header-rows`` or ``:widths`` line is also accepted when no
    ``.. table::`` line is found above it; no title is reported in that case.

    Returns:
        - is_table_directive: True if the table is accepted as described above
        - directive_start: Index of the ``.. table::`` line (or of the option
          line in the backward-compatible case); -1 when not accepted
        - table_title: Title text following ``.. table::``, or None
        - table_indent: Leading spaces/tabs of the table's first line; ''
          when not accepted
    """
    directive = '.. table::'
    not_found = (False, -1, None, '')

    # Leading whitespace of the table's first line is reused when rebuilding.
    table_line = lines[start_idx]
    table_indent = table_line[:len(table_line) - len(table_line.lstrip(' \t'))]

    # Skip empty lines immediately before the table.
    nearest_idx = start_idx - 1
    while nearest_idx >= 0 and lines[nearest_idx].strip() == '':
        nearest_idx -= 1
    if nearest_idx < 0:
        return not_found

    # Walk up over a contiguous block of ':option:' lines (possibly none) to
    # reach the line that would hold the directive itself.
    directive_idx = nearest_idx
    while directive_idx >= 0 and lines[directive_idx].strip().startswith(':'):
        directive_idx -= 1

    if directive_idx >= 0:
        candidate = lines[directive_idx].strip()
        if candidate.startswith(directive):
            # Take the title from the directive line, never from an option.
            title = candidate[len(directive):].strip()
            return True, directive_idx, title or None, table_indent

    # Legacy acceptance: these two option names alone mark a directive table.
    if lines[nearest_idx].strip().startswith((':header-rows', ':widths')):
        return True, nearest_idx, None, table_indent

    return not_found


def _is_header_separator(text: str) -> bool:
    """Return True if stripped ``text`` is a header separator such as '+===+===+'."""
    return text.startswith('+') and '=' in text and set(text) <= {'+', '='}


def _count_header_rows(table_lines: List[str]) -> int:
    """Return how many rows precede the first '=' separator (0 if there is none)."""
    rows_seen = 0
    in_row = False
    for raw in table_lines:
        text = raw.strip()
        if text.startswith('|'):
            in_row = True
            continue
        # A border line closes the row that was being read, if any.
        if in_row:
            rows_seen += 1
            in_row = False
        if _is_header_separator(text):
            return rows_seen
    return 0


def _mark_header_separator(table_lines: List[str], header_rows: int) -> None:
    """Turn the border that closes row number ``header_rows`` into a '=' separator, in place."""
    rows_seen = 0
    in_row = False
    for idx, raw in enumerate(table_lines):
        if raw.strip().startswith('|'):
            in_row = True
        elif in_row:
            in_row = False
            rows_seen += 1
            if rows_seen == header_rows:
                table_lines[idx] = raw.replace('-', '=')
                return


def process_rst_file(content: str, columns_to_remove: List[str], remove_empty_rows: bool = False) -> str:
    """
    Remove the named columns from every directive-declared grid table in ``content``.

    A grid table is processed only when find_table_directive accepts it. The
    first row of each table is treated as the header row for matching names in
    ``columns_to_remove``. Tables in which none of the names match are copied
    through unchanged, as is all text outside tables.

    When columns are removed the table is re-rendered with recomputed column
    widths and its original indentation. If the original table had a '='
    header separator, it is restored after the surviving header rows provided
    at least one body row remains. A one-line summary per modified table is
    written to stderr.

    Args:
        content: Full text of the RST document.
        columns_to_remove: Header names of the columns to drop.
        remove_empty_rows: Also drop rows left with only whitespace.

    Returns:
        The processed document text, lines joined with a newline.
    """
    lines = content.split('\n')
    result_lines = []
    i = 0

    while i < len(lines):
        line = lines[i]

        # Only a border line containing '-' can open a grid table.
        if not (line.strip().startswith('+') and '-' in line):
            result_lines.append(line)
            i += 1
            continue

        is_directive_table, _directive_start, table_title, table_indent = find_table_directive(lines, i)
        if not is_directive_table:
            # Grid table not declared with .. table:: directive, keep it unchanged
            result_lines.append(line)
            i += 1
            continue

        table_data, _original_col_widths, end_idx = parse_grid_table(lines, i)
        if not table_data:
            # Not a valid grid table, keep the line
            result_lines.append(line)
            i += 1
            continue

        # Assume first row contains headers
        headers = table_data[0]
        col_indices = find_column_indices(headers, columns_to_remove)

        # Note: every source line before ``i`` has already been emitted by the
        # branches above. Output length must NOT be used as a source index,
        # because a previously rewritten table may have fewer lines than its
        # original.
        if not col_indices:
            # No columns to remove, keep original table
            result_lines.extend(lines[i:end_idx])
            i = end_idx
            continue

        # Filter header and body rows separately so the number of surviving
        # header rows is known even when emptied rows are dropped.
        header_count = _count_header_rows(lines[i:end_idx])
        new_header = remove_columns_and_empty_rows(table_data[:header_count], col_indices, remove_empty_rows)
        new_body = remove_columns_and_empty_rows(table_data[header_count:], col_indices, remove_empty_rows)

        new_table_lines = rebuild_table(new_header + new_body, table_indent)
        # A '=' separator as the table's last line would be malformed, so it
        # is only restored when body rows follow it.
        if new_header and new_body:
            _mark_header_separator(new_table_lines, len(new_header))
        result_lines.extend(new_table_lines)

        title_info = f" ('{table_title}')" if table_title else ""
        removed_cols_info = ['\n'.join(headers[idx]).strip() for idx in sorted(col_indices)]
        empty_rows_info = " and empty rows" if remove_empty_rows else ""
        print(f"Processed table{title_info}: removed columns {removed_cols_info}{empty_rows_info}", file=sys.stderr)

        i = end_idx

    return '\n'.join(result_lines)


def main():
    """
    Command-line entry point.

    Reads the input document (a file, or stdin when the argument is "-"),
    takes the processing instructions from field-list entries inside that
    document, and writes the processed text to the output file or to stdout.
    Read, processing and write failures are reported on stderr and end the
    process with status 1.
    """
    epilog_text = """
Examples:
  %(prog)s input.rst -o output.rst
  %(prog)s input.rst --output output.rst
  cat input.rst | %(prog)s - > output.rst

The RST file should contain field list entries specifying columns to remove:
  :remove-column-from-html-table: Column1, Column2, Column3

Note: Only processes grid tables declared with the .. table:: directive.
Standalone grid tables will be left unchanged.
        """

    parser = argparse.ArgumentParser(
        description='Remove columns from reStructuredText grid tables declared with .. table:: directive',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog_text,
    )
    parser.add_argument('input', help='Input RST file (use "-" for stdin)')
    parser.add_argument('-o', '--output', help='Output file (default: stdout)')
    options = parser.parse_args()

    # Load the whole document into memory.
    if options.input != '-':
        try:
            with open(options.input, 'r', encoding='utf-8') as source:
                content = source.read()
        except IOError as e:
            print(f"Error reading input file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        content = sys.stdin.read()

    # All settings come from field-list entries in the document itself.
    lines = content.split('\n')

    # May end the process (status 0) when the build context does not match.
    check_docs_build_context(lines)

    columns_to_remove = find_column_removal_directives(lines)
    remove_empty_rows = should_remove_emptied_rows(lines)

    if not columns_to_remove:
        print("No column removal directives found in the file.", file=sys.stderr)
        print("Looking for: :remove-column-from-html-table: Column1, Column2", file=sys.stderr)

    if remove_empty_rows:
        print("Will remove rows that become empty after column removal.", file=sys.stderr)

    try:
        result = process_rst_file(content, columns_to_remove, remove_empty_rows)
    except Exception as e:
        print(f"Error processing file: {e}", file=sys.stderr)
        sys.exit(1)

    # Without an output path the result goes to stdout, with no extra newline.
    if not options.output:
        print(result, end='')
        return

    try:
        with open(options.output, 'w', encoding='utf-8') as sink:
            sink.write(result)
    except IOError as e:
        print(f"Error writing output file: {e}", file=sys.stderr)
        sys.exit(1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()