
Reimplement table column and row removal to be output format agnostic. Change-Id: I4822d53d37fd4604bf45c4bc4a315c8fc904376a Signed-off-by: Ron Stone <ronald.stone@windriver.com>
477 lines
17 KiB
Python
477 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to remove columns from reStructuredText grid tables based on column names.
|
|
Only processes grid tables that are declared using the .. table:: directive.
|
|
Column names to remove are specified using field lists in the RST file itself.
|
|
|
|
Usage:
|
|
python remove_rst_columns.py input.rst -o output.rst
|
|
python remove_rst_columns.py input.rst --output output.rst
|
|
|
|
If no output file is specified, the result is printed to stdout.
|
|
|
|
The RST file should contain field list entries specifying columns to remove:
|
|
:remove-column-from-html-table: Column1, Column2, Column3
|
|
|
|
Note: Only grid tables declared with the .. table:: directive will be modified.
|
|
Standalone grid tables without the directive will be left unchanged.
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
import re
|
|
import os
|
|
from typing import List, Tuple, Optional
|
|
|
|
|
|
def parse_grid_table(lines: List[str], start_idx: int) -> Tuple[List[List[List[str]]], List[int], int]:
    """
    Read one reStructuredText grid table out of ``lines``.

    Scanning begins at ``start_idx``; any lines before the first border
    line (a line whose stripped text starts with '+') are skipped.

    Returns:
    - table_data: one entry per row; each row is a list of cells and each
      cell is the list of its text lines (right-stripped, left padding kept)
    - col_widths: width of every column as laid out by the first border line
    - end_idx: index of the first line after the table

    ``([], [], start_idx)`` is returned when no border line is found, and
    ``([], [], end_idx)`` when the first border has fewer than two '+'.
    """
    total = len(lines)

    # Move forward to the first line that looks like a border.
    cursor = start_idx
    while cursor < total and not lines[cursor].strip().startswith('+'):
        cursor += 1
    if cursor >= total:
        return [], [], start_idx

    # Take the contiguous run of border ('+') and data ('|') lines.
    block = []
    while cursor < total and lines[cursor].strip().startswith(('+', '|')):
        block.append(lines[cursor])
        cursor += 1
    if not block:
        return [], [], start_idx

    # Every '+' on the first border marks a column boundary.
    plus_offsets = [found.start() for found in re.finditer(r'\+', block[0])]
    if len(plus_offsets) < 2:
        return [], [], cursor

    # (left boundary, right boundary) for each column.
    spans = list(zip(plus_offsets, plus_offsets[1:]))
    col_widths = [right - left - 1 for left, right in spans]

    table_data = []
    pending = None  # row currently being assembled, if any
    for raw in block:
        marker = raw.strip()[:1]
        if marker == '+':
            # A border closes whatever row was being collected.
            if pending is not None:
                table_data.append(pending)
            pending = None
        elif marker == '|':
            if pending is None:
                pending = [[] for _ in spans]
            # Slice by the border's column offsets; keep indentation and
            # empty lines, drop only the trailing padding.
            for cell, (left, right) in zip(pending, spans):
                cell.append(raw[left + 1:right].rstrip())

    # A table that ends without a closing border still yields its last row.
    if pending is not None:
        table_data.append(pending)

    return table_data, col_widths, cursor
|
|
|
|
|
|
def find_column_removal_directives(lines: List[str]) -> List[str]:
    """
    Collect the column names listed in removal field entries.

    Every line whose stripped text starts with
    ``:remove-column-from-html-table:`` contributes the comma-separated
    names that follow it; blank names are dropped.

    Returns the names in the order they were encountered.
    """
    marker = ':remove-column-from-html-table:'
    wanted = []

    for raw in lines:
        text = raw.strip()
        if not text.startswith(marker):
            continue
        # Everything after the field name is the comma-separated list.
        remainder = text[len(marker):].strip()
        for piece in remainder.split(','):
            name = piece.strip()
            if name:
                wanted.append(name)

    return wanted
|
|
|
|
|
|
def check_docs_build_context(lines: List[str]) -> None:
    """
    Compare the file's ``:docs-build-context:`` value with the environment.

    The first ``:docs-build-context:`` entry that carries a non-empty value
    is compared with the DOCS_BUILD_CONTEXT environment variable. On a
    mismatch a message is written to stderr and the process exits with
    status 0; on a match a confirmation is written to stderr. Nothing
    happens when the file declares no context.
    """
    prefix = ':docs-build-context:'

    # Values of every matching entry, in file order (lazily evaluated).
    candidates = (
        entry.strip()[len(prefix):].strip()
        for entry in lines
        if entry.strip().startswith(prefix)
    )
    declared = next((value for value in candidates if value), None)
    if declared is None:
        return

    actual = os.environ.get('DOCS_BUILD_CONTEXT')
    if actual != declared:
        print(f"Docs-build-context '{declared}' does not match DOCS_BUILD_CONTEXT environment variable '{actual}'. Skipping.", file=sys.stderr)
        sys.exit(0)

    print(f"docs-build-context '{declared}' matches environment variable", file=sys.stderr)
|
|
|
|
|
|
def should_remove_emptied_rows(lines: List[str]) -> bool:
    """
    Report whether the ``:remove-column-emptied-row:`` entry is set to 1.

    Only the first matching entry is consulted; a missing entry means False.
    """
    key = ':remove-column-emptied-row:'
    stripped = (raw.strip() for raw in lines)
    first_entry = next((text for text in stripped if text.startswith(key)), None)
    if first_entry is None:
        return False
    return first_entry[len(key):].strip() == '1'
|
|
|
|
|
|
def find_first_table_position(lines: List[str]) -> int:
    """
    Locate the first grid table that find_table_directive() accepts.

    Returns the index of that table's first border line, or len(lines)
    when the document contains no such table.
    """
    for position, text in enumerate(lines):
        # Candidate borders start with '+' and contain at least one '-'.
        if '-' not in text or not text.strip().startswith('+'):
            continue
        # Only tables confirmed by the directive lookup count.
        if find_table_directive(lines, position)[0]:
            return position
    return len(lines)
|
|
|
|
|
|
def find_column_indices(headers: List[List[str]], columns_to_remove: List[str]) -> List[int]:
    """
    Find the indices of columns to remove based on header names.

    ``headers`` is the header row: one list of text lines per cell.
    Each requested name is matched against the header cells in three
    passes, stopping at the first pass that finds a match:

    1. exact match,
    2. case-insensitive match,
    3. case-insensitive match with every run of whitespace (including the
       line breaks of a header wrapped over several lines) collapsed to a
       single space.

    The third pass exists because a wrapped header is stored with embedded
    newlines and could otherwise never equal a single-line name. When a
    pass matches several headers, the first one wins. A warning is written
    to stderr for every name that matches no header.

    Returns the matching indices without duplicates, highest first.
    """
    # Flatten each header cell to one string, then precompute the
    # normalised forms used by the fallback passes.
    header_strings = ['\n'.join(cell).strip() for cell in headers]
    lowered = [text.lower() for text in header_strings]
    collapsed = [' '.join(text.split()) for text in lowered]

    indices = []
    for col_name in columns_to_remove:
        wanted_lower = col_name.lower()
        wanted_collapsed = ' '.join(wanted_lower.split())

        if col_name in header_strings:
            indices.append(header_strings.index(col_name))
        elif wanted_lower in lowered:
            indices.append(lowered.index(wanted_lower))
        elif wanted_collapsed and wanted_collapsed in collapsed:
            # Guarded so a whitespace-only name cannot match an empty header.
            indices.append(collapsed.index(wanted_collapsed))
        else:
            print(f"Warning: Column '{col_name}' not found in table headers", file=sys.stderr)

    return sorted(set(indices), reverse=True)  # Remove duplicates and sort in reverse order
|
|
|
|
|
|
def remove_columns_and_empty_rows(table_data: List[List[List[str]]], col_indices: List[int], remove_empty_rows: bool = False) -> List[List[List[str]]]:
    """
    Remove specified columns from table data and optionally drop emptied rows.

    Parameters:
    - table_data: rows of cells, each cell being a list of text lines
    - col_indices: column indices to remove; any order, duplicates allowed.
      Indices that do not exist in a given row are ignored for that row.
    - remove_empty_rows: when True, a row whose remaining cells contain
      only whitespace (or that has no cells left) is omitted

    Returns a new table; ``table_data`` and its rows are not modified
    (the surviving cell lists are shared with the input).
    """
    new_table = []
    for row in table_data:
        width = len(row)
        # Resolve the request into a set of positions for this row. Using a
        # set makes the result independent of the order of col_indices and
        # of duplicates, which sequential pop() calls were not.
        doomed = {idx + width if idx < 0 else idx for idx in col_indices}
        kept = [cell for pos, cell in enumerate(row) if pos not in doomed]

        # A row counts as empty when no remaining cell has visible text.
        if remove_empty_rows and not any('\n'.join(cell).strip() for cell in kept):
            continue

        new_table.append(kept)
    return new_table
|
|
|
|
|
|
def calculate_column_widths(table_data: List[List[List[str]]]) -> List[int]:
    """
    Work out the narrowest width that fits every line of every column.

    The number of columns is taken from the first row; cells beyond that
    count in later rows are ignored. Trailing whitespace does not count
    towards a line's width. An empty table yields an empty list.
    """
    if not table_data:
        return []

    column_count = len(table_data[0])
    widths = [0] * column_count

    for row in table_data:
        # Slicing discards any surplus cells in rows longer than the first.
        for position, cell in enumerate(row[:column_count]):
            longest = max((len(text.rstrip()) for text in cell), default=0)
            if longest > widths[position]:
                widths[position] = longest

    return widths
|
|
|
|
|
|
def generate_border_line(col_widths: List[int], indent: str = '') -> str:
    """
    Build one horizontal border such as ``+----+--+``.

    Each column contributes ``width`` dashes followed by '+'; the whole
    line is prefixed with ``indent``.
    """
    segments = ''.join('-' * width + '+' for width in col_widths)
    return indent + '+' + segments
|
|
|
|
|
|
def generate_data_lines(row: List[List[str]], col_widths: List[int], indent: str = '') -> List[str]:
    """
    Render one table row as its ``|``-delimited text lines.

    The row is as tall as its tallest cell; shorter cells are filled with
    blanks. Each cell line is right-stripped and then padded to its column
    width. Cells without a corresponding entry in ``col_widths`` are not
    rendered. Every line is prefixed with ``indent``.
    """
    # Height is taken over all cells of the row.
    height = max(map(len, row), default=0)

    # zip() stops at the shorter sequence, which skips unrenderable cells.
    columns = list(zip(row, col_widths))

    rendered = []
    for line_no in range(height):
        pieces = []
        for cell, width in columns:
            text = cell[line_no].rstrip() if line_no < len(cell) else ''
            pieces.append(text.ljust(width) + '|')
        rendered.append(indent + '|' + ''.join(pieces))

    return rendered
|
|
|
|
|
|
def rebuild_table(table_data: List[List[List[str]]], indent: str = '') -> List[str]:
    """
    Turn table data back into grid-table text lines.

    Column widths are recomputed from the content. The output is a border
    line, then for each row its data lines followed by another border
    line, all prefixed with ``indent``. An empty table yields no lines.
    """
    if not table_data:
        return []

    widths = calculate_column_widths(table_data)
    separator = generate_border_line(widths, indent)

    output = [separator]
    for row in table_data:
        output += generate_data_lines(row, widths, indent)
        output.append(separator)

    return output
|
|
|
|
|
|
def find_table_directive(lines: List[str], start_idx: int) -> Tuple[bool, int, Optional[str], str]:
    """
    Check if there's a .. table:: directive before the given grid table.

    Looking backwards from ``start_idx`` and skipping blank lines, the
    nearest non-empty line must be either the ``.. table::`` directive
    itself or a ``:header-rows``/``:widths`` option line. In the option
    case the lines above it are followed upwards through any further
    ``:option:`` lines to find the directive they belong to, so the
    reported start index and title come from the directive line and not
    from the option text.

    Returns:
    - is_table_directive: True if this grid table is declared with .. table::
    - directive_start: Index of the ``.. table::`` line; if an option line
      was found with no ``.. table::`` line above it, the index of that
      option line; -1 when is_table_directive is False
    - table_title: Title from the directive line, or None if there is none
    - table_indent: The leading spaces/tabs of the table's first line
      ('' when is_table_directive is False)
    """
    directive = '.. table::'

    # Leading whitespace of the table's first border line.
    table_line = lines[start_idx]
    table_indent = table_line[:len(table_line) - len(table_line.lstrip(' \t'))]

    # Walk back over the blank lines directly above the table.
    i = start_idx - 1
    while i >= 0 and lines[i].strip() == '':
        i -= 1

    if i < 0:
        return False, -1, None, ''

    nearest = lines[i].strip()

    if nearest.startswith(directive):
        # 'or None' maps a directive without title text to None.
        return True, i, nearest[len(directive):].strip() or None, table_indent

    if nearest.startswith((':header-rows', ':widths')):
        # Directive options sit directly below the directive line, so step
        # over the contiguous option lines to reach it.
        j = i - 1
        while j >= 0 and lines[j].strip().startswith(':'):
            j -= 1

        if j >= 0:
            owner = lines[j].strip()
            if owner.startswith(directive):
                return True, j, owner[len(directive):].strip() or None, table_indent

        # No directive line above the options: still treated as a declared
        # table (as before), but without inventing a title from option text.
        return True, i, None, table_indent

    # Nearest non-empty line is neither the directive nor a known option.
    return False, -1, None, ''
|
|
|
|
|
|
def process_rst_file(content: str, columns_to_remove: List[str], remove_empty_rows: bool = False) -> str:
|
|
"""Process the entire RST content and remove specified columns from grid tables declared with .. table:: directive."""
|
|
lines = content.split('\n')
|
|
result_lines = []
|
|
i = 0
|
|
|
|
while i < len(lines):
|
|
line = lines[i]
|
|
|
|
# Check if this line might be the start of a grid table
|
|
if line.strip().startswith('+') and '-' in line:
|
|
# Check if this grid table is declared with .. table:: directive
|
|
is_directive_table, directive_start, table_title, table_indent = find_table_directive(lines, i)
|
|
|
|
if is_directive_table:
|
|
# Try to parse as a grid table
|
|
table_data, original_col_widths, end_idx = parse_grid_table(lines, i)
|
|
|
|
if table_data and len(table_data) > 0:
|
|
# Assume first row contains headers
|
|
headers = table_data[0]
|
|
col_indices = find_column_indices(headers, columns_to_remove)
|
|
|
|
# Add any lines we haven't processed yet up to the current position
|
|
while len(result_lines) < i:
|
|
result_lines.append(lines[len(result_lines)])
|
|
|
|
if col_indices:
|
|
# Remove specified columns and rebuild table with preserved indentation
|
|
new_table_data = remove_columns_and_empty_rows(table_data, col_indices, remove_empty_rows)
|
|
new_table_lines = rebuild_table(new_table_data, table_indent)
|
|
result_lines.extend(new_table_lines)
|
|
|
|
title_info = f" ('{table_title}')" if table_title else ""
|
|
removed_cols_info = ['\n'.join(headers[idx]).strip() for idx in sorted(col_indices)]
|
|
empty_rows_info = " and empty rows" if remove_empty_rows else ""
|
|
print(f"Processed table{title_info}: removed columns {removed_cols_info}{empty_rows_info}", file=sys.stderr)
|
|
else:
|
|
# No columns to remove, keep original table
|
|
result_lines.extend(lines[i:end_idx])
|
|
|
|
i = end_idx
|
|
else:
|
|
# Not a valid grid table, keep the line
|
|
result_lines.append(line)
|
|
i += 1
|
|
else:
|
|
# Grid table not declared with .. table:: directive, keep it unchanged
|
|
result_lines.append(line)
|
|
i += 1
|
|
else:
|
|
# Regular line, keep it
|
|
result_lines.append(line)
|
|
i += 1
|
|
|
|
return '\n'.join(result_lines)
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Reads the RST input (a file, or stdin when the argument is "-"), runs
    the docs-build-context check, collects the removal settings from the
    document's own field entries, processes the content and writes the
    result to the file given with -o/--output or to stdout. Read, write
    and processing failures are reported on stderr and end the process
    with status 1.
    """
    cli = argparse.ArgumentParser(
        description='Remove columns from reStructuredText grid tables declared with .. table:: directive',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s input.rst -o output.rst
%(prog)s input.rst --output output.rst
cat input.rst | %(prog)s - > output.rst

The RST file should contain field list entries specifying columns to remove:
:remove-column-from-html-table: Column1, Column2, Column3

Note: Only processes grid tables declared with the .. table:: directive.
Standalone grid tables will be left unchanged.
"""
    )
    cli.add_argument('input',
                     help='Input RST file (use "-" for stdin)')
    cli.add_argument('-o', '--output',
                     help='Output file (default: stdout)')
    options = cli.parse_args()

    # Load the whole document into memory.
    if options.input == '-':
        content = sys.stdin.read()
    else:
        try:
            with open(options.input, 'r', encoding='utf-8') as source:
                content = source.read()
        except IOError as err:
            print(f"Error reading input file: {err}", file=sys.stderr)
            sys.exit(1)

    # The document itself carries the processing settings.
    lines = content.split('\n')

    # May end the process when the declared context does not match.
    check_docs_build_context(lines)

    columns_to_remove = find_column_removal_directives(lines)
    remove_empty_rows = should_remove_emptied_rows(lines)

    if not columns_to_remove:
        print("No column removal directives found in the file.", file=sys.stderr)
        print("Looking for: :remove-column-from-html-table: Column1, Column2", file=sys.stderr)

    if remove_empty_rows:
        print("Will remove rows that become empty after column removal.", file=sys.stderr)

    try:
        result = process_rst_file(content, columns_to_remove, remove_empty_rows)
    except Exception as err:
        print(f"Error processing file: {err}", file=sys.stderr)
        sys.exit(1)

    # Without an output path the result goes to stdout.
    if not options.output:
        print(result, end='')
        return

    try:
        with open(options.output, 'w', encoding='utf-8') as target:
            target.write(result)
    except IOError as err:
        print(f"Error writing output file: {err}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script; importing it as a module has no side effects.
if __name__ == '__main__':
    main()
|