#!/usr/bin/env python3
"""
Script to remove columns from reStructuredText grid tables based on column names.
Only processes grid tables that are declared using the .. table:: directive.
Column names to remove are specified using field lists in the RST file itself.

Usage:
    python remove_rst_columns.py input.rst -o output.rst
    python remove_rst_columns.py input.rst --output output.rst

If no output file is specified, the result is printed to stdout.

The RST file should contain field list entries specifying columns to remove:
    :remove-column-from-html-table: Column1, Column2, Column3

Note: Only grid tables declared with the .. table:: directive will be modified.
Standalone grid tables without the directive will be left unchanged.
"""

import argparse
import os
import re
import sys
from typing import List, Tuple, Optional

# Field-list markers and directive text recognised by this script.
_REMOVE_COLUMN_FIELD = ':remove-column-from-html-table:'
_DOCS_BUILD_CONTEXT_FIELD = ':docs-build-context:'
_REMOVE_EMPTIED_ROW_FIELD = ':remove-column-emptied-row:'
_TABLE_DIRECTIVE = '.. table::'

# Matches a directive option / field-list line such as ":widths: 10 20".
_OPTION_LINE_RE = re.compile(r':[^:\s][^:]*:(\s|$)')


def parse_grid_table(lines: List[str], start_idx: int) -> Tuple[List[List[List[str]]], List[int], int]:
    """
    Parse a reStructuredText grid table starting at the given line index.

    Column boundaries are taken from the '+' positions of the FIRST border
    line only, so cells spanning several columns are not understood; every
    data line is simply sliced at those positions.

    Returns:
    - table_data: List of rows, where each row is a list of cells,
                  and each cell is a list of lines (to preserve multi-line content)
    - col_widths: List of column widths
    - end_idx: Index of the line after the table
    """
    # Skip forward to the first border line.
    i = start_idx
    while i < len(lines) and not lines[i].strip().startswith('+'):
        i += 1
    if i >= len(lines):
        return [], [], start_idx

    # Collect the contiguous run of border ('+') and data ('|') lines.
    table_lines = []
    while i < len(lines) and lines[i].strip().startswith(('+', '|')):
        table_lines.append(lines[i])
        i += 1
    if not table_lines:
        return [], [], start_idx

    # Absolute positions (including indentation) of every '+' in the top border.
    col_positions = [m.start() for m in re.finditer(r'\+', table_lines[0])]
    if len(col_positions) < 2:
        return [], [], i

    col_widths = [right - left - 1
                  for left, right in zip(col_positions, col_positions[1:])]
    num_cols = len(col_positions) - 1

    table_data = []
    current_row = None
    for line in table_lines:
        stripped = line.strip()
        if stripped.startswith('+'):
            # A border closes the row being accumulated, if any.
            if current_row is not None:
                table_data.append(current_row)
                current_row = None
        elif stripped.startswith('|'):
            if current_row is None:
                current_row = [[] for _ in range(num_cols)]
            for j in range(num_cols):
                # Only trailing whitespace is stripped so that indentation
                # inside the cell (and empty continuation lines) survive.
                cell_text = line[col_positions[j] + 1:col_positions[j + 1]].rstrip()
                current_row[j].append(cell_text)

    # A table that ends without a closing border still yields its last row.
    if current_row is not None:
        table_data.append(current_row)

    return table_data, col_widths, i


def find_column_removal_directives(lines: List[str]) -> List[str]:
    """
    Find field list entries that specify columns to remove.

    Looks for entries like:
    :remove-column-from-html-table: Column1, Column2

    Every matching line in the document contributes; names are split on
    commas, stripped, and empty names are discarded.

    Returns a list of column names to remove.
    """
    columns_to_remove = []
    for line in lines:
        stripped = line.strip()
        if not stripped.startswith(_REMOVE_COLUMN_FIELD):
            continue
        field_content = stripped[len(_REMOVE_COLUMN_FIELD):].strip()
        columns_to_remove.extend(
            name.strip() for name in field_content.split(',') if name.strip()
        )
    return columns_to_remove


def check_docs_build_context(lines: List[str]) -> None:
    """
    Check if the docs-build-context directive matches the DOCS_BUILD_CONTEXT
    environment variable.

    The first ``:docs-build-context:`` entry with a non-empty value is used.
    If it differs from the environment variable (including when the variable
    is unset), a message is written to stderr and the process exits with
    status 0. When no such entry exists nothing happens.
    """
    docs_context = None
    for line in lines:
        stripped = line.strip()
        if stripped.startswith(_DOCS_BUILD_CONTEXT_FIELD):
            field_content = stripped[len(_DOCS_BUILD_CONTEXT_FIELD):].strip()
            if field_content:
                docs_context = field_content
                break

    if docs_context is None:
        return

    env_context = os.environ.get('DOCS_BUILD_CONTEXT')
    if env_context != docs_context:
        print(f"Docs-build-context '{docs_context}' does not match DOCS_BUILD_CONTEXT environment variable '{env_context}'. Skipping.", file=sys.stderr)
        sys.exit(0)
    print(f"docs-build-context '{docs_context}' matches environment variable", file=sys.stderr)


def should_remove_emptied_rows(lines: List[str]) -> bool:
    """
    Check if the remove-column-emptied-row directive is set to 1.

    Only the first ``:remove-column-emptied-row:`` entry is consulted.
    """
    for line in lines:
        stripped = line.strip()
        if stripped.startswith(_REMOVE_EMPTIED_ROW_FIELD):
            return stripped[len(_REMOVE_EMPTIED_ROW_FIELD):].strip() == '1'
    return False


def find_first_table_position(lines: List[str]) -> int:
    """
    Find the position of the first table in the document.

    A line counts as a table start when it looks like a grid-table border
    and find_table_directive() accepts it.

    Returns the line number of the first table, or len(lines) if no table found.
    """
    for i, line in enumerate(lines):
        if line.strip().startswith('+') and '-' in line:
            is_table, _, _, _ = find_table_directive(lines, i)
            if is_table:
                return i
    return len(lines)


def _normalize_header(text: str) -> str:
    """Collapse every run of whitespace (including newlines) to one space."""
    return ' '.join(text.split())


def find_column_indices(headers: List[List[str]], columns_to_remove: List[str]) -> List[int]:
    """
    Find the indices of columns to remove based on header names.

    Header cells may span several lines; their text is whitespace-normalised
    so that a header wrapped as "Column" / "Name" matches "Column Name".
    An exact match is tried first, then a case-insensitive one. Names that
    match nothing produce a warning on stderr.

    Returns the unique indices sorted in descending order, so callers can
    pop them one after another without shifting the remaining indices.
    """
    header_strings = [_normalize_header('\n'.join(cell)) for cell in headers]
    lowered_headers = [header.lower() for header in header_strings]

    indices = set()
    for col_name in columns_to_remove:
        wanted = _normalize_header(col_name)
        if wanted in header_strings:
            indices.add(header_strings.index(wanted))
        elif wanted.lower() in lowered_headers:
            indices.add(lowered_headers.index(wanted.lower()))
        else:
            print(f"Warning: Column '{col_name}' not found in table headers", file=sys.stderr)

    return sorted(indices, reverse=True)


def remove_columns_and_empty_rows(table_data: List[List[List[str]]], col_indices: List[int], remove_empty_rows: bool = False) -> List[List[List[str]]]:
    """
    Remove specified columns from table data and optionally remove rows that become empty.

    col_indices is expected in descending order (as produced by
    find_column_indices) because the columns are popped in the given order.
    A row is "empty" when every remaining cell is blank; a row left with no
    cells at all is empty too.
    """
    new_table = []
    for row in table_data:
        new_row = row.copy()
        for idx in col_indices:
            if idx < len(new_row):
                new_row.pop(idx)

        if remove_empty_rows and not any('\n'.join(cell).strip() for cell in new_row):
            continue  # Row carries no content any more.

        new_table.append(new_row)
    return new_table


def calculate_column_widths(table_data: List[List[List[str]]]) -> List[int]:
    """
    Calculate the minimum width needed for each column, considering multi-line content.

    The number of columns is taken from the first row; extra cells in later
    rows are ignored.
    """
    if not table_data:
        return []

    col_widths = [0] * len(table_data[0])
    for row in table_data:
        for i, cell in enumerate(row[:len(col_widths)]):
            widest_line = max((len(text.rstrip()) for text in cell), default=0)
            col_widths[i] = max(col_widths[i], widest_line)
    return col_widths


def generate_border_line(col_widths: List[int], indent: str = '', fill: str = '-') -> str:
    """
    Generate a border line for the table with proper indentation.

    fill is the character drawn between the '+' corners: '-' for ordinary
    row borders, '=' for the header separator.
    """
    return indent + '+' + ''.join(fill * width + '+' for width in col_widths)


def generate_data_lines(row: List[List[str]], col_widths: List[int], indent: str = '') -> List[str]:
    """
    Generate data lines for a table row, handling multi-line cells with proper indentation.

    Cells with fewer lines than the tallest cell of the row are padded with
    blank lines; every cell line is left-justified to its column width.
    """
    max_lines = max((len(cell) for cell in row), default=0)

    data_lines = []
    for line_idx in range(max_lines):
        parts = ['|']
        for col_idx, cell in enumerate(row[:len(col_widths)]):
            cell_content = cell[line_idx].rstrip() if line_idx < len(cell) else ''
            parts.append(cell_content.ljust(col_widths[col_idx]) + '|')
        data_lines.append(indent + ''.join(parts))
    return data_lines


def rebuild_table(table_data: List[List[List[str]]], indent: str = '', header_rows: int = 0) -> List[str]:
    """
    Rebuild the grid table as a list of lines with proper indentation and preserved formatting.

    header_rows is the number of leading rows that form the table header;
    the border after the last of them is drawn with '=' so the header
    separator of the original table is kept. With the default of 0 every
    border uses '-'.
    """
    if not table_data:
        return []

    col_widths = calculate_column_widths(table_data)
    border_line = generate_border_line(col_widths, indent)
    header_border = generate_border_line(col_widths, indent, fill='=')

    lines = [border_line]
    for row_number, row in enumerate(table_data, start=1):
        lines.extend(generate_data_lines(row, col_widths, indent))
        lines.append(header_border if row_number == header_rows else border_line)
    return lines


def _count_header_rows(table_lines: List[str]) -> int:
    """
    Count the rows that precede the '+===+' header separator.

    Rows are delimited exactly as in parse_grid_table(). Returns 0 when the
    table has no header separator.
    """
    rows_seen = 0
    in_row = False
    for line in table_lines:
        stripped = line.strip()
        if stripped.startswith('+'):
            if in_row:
                rows_seen += 1
                in_row = False
            if stripped.startswith('+='):
                return rows_seen
        elif stripped.startswith('|'):
            in_row = True
    return 0


def find_table_directive(lines: List[str], start_idx: int) -> Tuple[bool, int, Optional[str], str]:
    """
    Check if there's a .. table:: directive before the given grid table.

    Looking backwards from the table and skipping blank lines, the nearest
    line must be either the ``.. table::`` directive itself or a directive
    option line (``:name: value``). In the latter case the contiguous block
    of option lines is walked upwards to locate the directive, so the title
    and start index always refer to the real ``.. table::`` line. For
    backward compatibility a nearest line starting with ``:header-rows`` or
    ``:widths`` is accepted even when no directive is found above it.

    Returns:
    - is_table_directive: True if this grid table is declared with .. table::
    - directive_start: Index where the directive starts (-1 if none)
    - table_title: Optional title from the directive
    - table_indent: The indentation string used for the table
    """
    table_line = lines[start_idx]
    table_indent = table_line[:len(table_line) - len(table_line.lstrip(' \t'))]

    # Nearest non-blank line above the table.
    i = start_idx - 1
    while i >= 0 and lines[i].strip() == '':
        i -= 1
    if i < 0:
        return False, -1, None, ''

    nearest = lines[i].strip()
    if nearest.startswith(_TABLE_DIRECTIVE):
        return True, i, _directive_title(nearest), table_indent

    legacy_option = nearest.startswith((':header-rows', ':widths'))
    if legacy_option or _OPTION_LINE_RE.match(nearest):
        # Options follow their directive without blank lines, so walk up
        # through the contiguous option block only.
        j = i
        while j >= 0 and _OPTION_LINE_RE.match(lines[j].strip()):
            j -= 1
        if j >= 0 and lines[j].strip().startswith(_TABLE_DIRECTIVE):
            return True, j, _directive_title(lines[j].strip()), table_indent
        if legacy_option:
            # No directive located; keep accepting the table as before but
            # do not invent a title from the option text.
            return True, i, None, table_indent

    return False, -1, None, ''


def _directive_title(stripped_directive: str) -> Optional[str]:
    """Return the text following '.. table::', or None when there is none."""
    return stripped_directive[len(_TABLE_DIRECTIVE):].strip() or None


def process_rst_file(content: str, columns_to_remove: List[str], remove_empty_rows: bool = False) -> str:
    """
    Process the entire RST content and remove specified columns from grid
    tables declared with .. table:: directive.

    The first row of each table is treated as the header row for matching
    column names. Tables in which none of the requested columns are found
    are copied through verbatim. Rebuilt tables keep their indentation and
    their '=' header separator. A one-line summary per modified table is
    written to stderr.

    Returns the processed document as a single string.
    """
    lines = content.split('\n')
    result_lines = []
    i = 0

    while i < len(lines):
        line = lines[i]

        # Anything that does not look like a top border is copied as-is.
        if not (line.strip().startswith('+') and '-' in line):
            result_lines.append(line)
            i += 1
            continue

        is_directive_table, _, table_title, table_indent = find_table_directive(lines, i)
        if not is_directive_table:
            # Grid table not declared with .. table::, keep it unchanged.
            result_lines.append(line)
            i += 1
            continue

        table_data, _, end_idx = parse_grid_table(lines, i)
        if not table_data:
            # Not a valid grid table, keep the line.
            result_lines.append(line)
            i += 1
            continue

        headers = table_data[0]
        col_indices = find_column_indices(headers, columns_to_remove)

        if col_indices:
            # Header and body are filtered separately so the header
            # separator stays after the right row even if rows are dropped.
            header_count = _count_header_rows(lines[i:end_idx])
            new_header = remove_columns_and_empty_rows(
                table_data[:header_count], col_indices, remove_empty_rows)
            new_body = remove_columns_and_empty_rows(
                table_data[header_count:], col_indices, remove_empty_rows)
            # A header separator as the last line of a table is not valid,
            # so it is only drawn when body rows remain.
            kept_header_rows = len(new_header) if new_body else 0
            result_lines.extend(
                rebuild_table(new_header + new_body, table_indent,
                              header_rows=kept_header_rows))

            title_info = f" ('{table_title}')" if table_title else ""
            removed_cols_info = ['\n'.join(headers[idx]).strip() for idx in sorted(col_indices)]
            empty_rows_info = " and empty rows" if remove_empty_rows else ""
            print(f"Processed table{title_info}: removed columns {removed_cols_info}{empty_rows_info}", file=sys.stderr)
        else:
            # No columns to remove, keep original table.
            result_lines.extend(lines[i:end_idx])

        i = end_idx

    return '\n'.join(result_lines)


def main():
    """
    Command-line entry point.

    Reads the input file (or stdin for "-"), honours the
    ``:docs-build-context:`` gate, collects the removal settings from the
    document itself, processes it and writes the result to the output file
    or stdout. Exits with status 1 on read, processing or write errors.
    """
    parser = argparse.ArgumentParser(
        description='Remove columns from reStructuredText grid tables declared with .. table:: directive',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s input.rst -o output.rst
  %(prog)s input.rst --output output.rst
  cat input.rst | %(prog)s - > output.rst

The RST file should contain field list entries specifying columns to remove:
  :remove-column-from-html-table: Column1, Column2, Column3

Note: Only processes grid tables declared with the .. table:: directive.
Standalone grid tables will be left unchanged.
        """
    )
    parser.add_argument('input', help='Input RST file (use "-" for stdin)')
    parser.add_argument('-o', '--output', help='Output file (default: stdout)')

    args = parser.parse_args()

    # Read input
    if args.input == '-':
        content = sys.stdin.read()
    else:
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                content = f.read()
        except OSError as e:
            print(f"Error reading input file: {e}", file=sys.stderr)
            sys.exit(1)

    lines = content.split('\n')

    # May exit(0) when the document targets a different build context.
    check_docs_build_context(lines)

    # The removal settings live in the document itself.
    columns_to_remove = find_column_removal_directives(lines)
    remove_empty_rows = should_remove_emptied_rows(lines)

    if not columns_to_remove:
        print("No column removal directives found in the file.", file=sys.stderr)
        print("Looking for: :remove-column-from-html-table: Column1, Column2", file=sys.stderr)

    if remove_empty_rows:
        print("Will remove rows that become empty after column removal.", file=sys.stderr)

    # Process content; this is the top-level boundary, so report and exit.
    try:
        result = process_rst_file(content, columns_to_remove, remove_empty_rows)
    except Exception as e:
        print(f"Error processing file: {e}", file=sys.stderr)
        sys.exit(1)

    # Write output
    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(result)
        except OSError as e:
            print(f"Error writing output file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(result, end='')


if __name__ == '__main__':
    main()