Source code for confluence_content_parser.nodes

from __future__ import annotations

import html
from abc import ABC
from collections.abc import Iterator
from enum import Enum
from typing import Any, TypeVar, overload

from pydantic import BaseModel, Field

T1 = TypeVar("T1", bound="Node")
T2 = TypeVar("T2", bound="Node")
T3 = TypeVar("T3", bound="Node")
T4 = TypeVar("T4", bound="Node")
T5 = TypeVar("T5", bound="Node")


[docs] class Node(BaseModel, ABC): """Base class for all content nodes in the Confluence document tree.""" is_block_level: bool = False
[docs] def walk(self) -> Iterator[Node]: """Walk through this node and all its descendants.""" yield self for child in self.get_children(): yield from child.walk()
[docs] def get_children(self) -> list[Node]: """Get direct children of this node. Override in subclasses.""" return []
[docs] def to_text(self) -> str: """Get text representation of this node. Override in subclasses.""" return ""
@overload def find_all(self) -> list[Node]: ... @overload def find_all(self, node_type: type[T1]) -> list[T1]: ... @overload def find_all(self, t1: type[T1], t2: type[T2]) -> tuple[list[T1], list[T2]]: ... @overload def find_all(self, t1: type[T1], t2: type[T2], t3: type[T3]) -> tuple[list[T1], list[T2], list[T3]]: ... @overload def find_all( self, t1: type[T1], t2: type[T2], t3: type[T3], t4: type[T4] ) -> tuple[list[T1], list[T2], list[T3], list[T4]]: ... @overload def find_all( self, t1: type[T1], t2: type[T2], t3: type[T3], t4: type[T4], t5: type[T5] ) -> tuple[list[T1], list[T2], list[T3], list[T4], list[T5]]: ...
[docs] def find_all(self, *node_types) -> Any: # type: ignore[no-untyped-def,misc] """Find all nodes of specific type(s) in this subtree with modern variadic generics. Args: *node_types: Either no arguments (all nodes), a single node class, or multiple node classes. Returns: - No arguments: list[Node] (all nodes) - Single type: list[T] where T is the requested type - Multiple types: tuple of lists with proper typing for each type Examples: # All nodes all_nodes = node.find_all() # Single type - returns list[HeadingElement] headings = node.find_all(HeadingElement) # Multiple types - returns tuple with proper typing headings, panels = node.find_all(HeadingElement, PanelMacro) headings, panels, links = node.find_all(HeadingElement, PanelMacro, LinkElement) """ if len(node_types) == 0: return list(self.walk()) if len(node_types) == 1: node_type = node_types[0] results = [] for node in self.walk(): if isinstance(node, node_type): results.append(node) return results result_lists: list[list[Node]] = [[] for _ in node_types] for node in self.walk(): for i, node_type in enumerate(node_types): if isinstance(node, node_type): result_lists[i].append(node) return tuple(result_lists)
[docs] class ContainerElement(Node): """Base for container elements.""" children: list[Node] = Field(default_factory=list) styles: dict[str, str] = Field(default_factory=dict)
[docs] def get_children(self) -> list[Node]: return self.children
[docs] def to_text(self) -> str: parts = [] for child in self.children: child_text = child.to_text() if clean_child_text := child_text.strip(): parts.append(clean_child_text) if self._has_block_children(): return "\n\n".join(parts) else: return " ".join(parts)
def _has_block_children(self) -> bool: """Check if this container has block-level children that should be separated by newlines.""" return any(child.is_block_level for child in self.children)
[docs] class Fragment(ContainerElement): """Neutral container for multiple top-level nodes (non-rendering).""" pass
[docs] class LayoutSectionType(Enum): """Type of layout section.""" SINGLE = "single" FIXED_WIDTH = "fixed-width" TWO_EQUAL = "two_equal" TWO_LEFT_SIDEBAR = "two_left_sidebar" TWO_RIGHT_SIDEBAR = "two_right_sidebar" THREE_EQUAL = "three_equal" THREE_WITH_SIDEBARS = "three_with_sidebars" THREE_LEFT_SIDEBARS = "three_left_sidebars" THREE_RIGHT_SIDEBARS = "three_right_sidebars" FOUR_EQUAL = "four_equal" FIVE_EQUAL = "five_equal"
[docs] class LayoutElement(ContainerElement): """A page layout container containing sections.""" is_block_level: bool = True
[docs] class LayoutSection(ContainerElement): """A layout section (row) containing cells.""" section_type: LayoutSectionType breakout_mode: str | None = None breakout_width: str | None = None is_block_level: bool = True
[docs] class LayoutCell(ContainerElement): """A layout cell (column) containing content.""" is_block_level: bool = True
[docs] class HeadingType(Enum): """Type of heading element.""" H1 = "h1" H2 = "h2" H3 = "h3" H4 = "h4" H5 = "h5" H6 = "h6"
[docs] class HeadingElement(ContainerElement): """A heading element.""" type: HeadingType is_block_level: bool = True
[docs] class TextEffectType(Enum): """Type of inline element.""" STRONG = "strong" EMPHASIS = "em" UNDERLINE = "u" STRIKETHROUGH = "del" MONOSPACE = "code" SUBSCRIPT = "sub" SUPERSCRIPT = "sup" BLOCKQUOTE = "blockquote" SPAN = "span"
[docs] class TextEffectElement(ContainerElement): """Base for inline formatting elements like bold, italic, etc.""" type: TextEffectType
[docs] class TextBreakType(Enum): """Type of text break element.""" PARAGRAPH = "p" LINE_BREAK = "br" HORIZONTAL_RULE = "hr"
[docs] class TextBreakElement(ContainerElement): """A text break element.""" type: TextBreakType is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of text break elements.""" if self.type == TextBreakType.HORIZONTAL_RULE: return "---" elif self.type == TextBreakType.LINE_BREAK: return "\n" else: return super().to_text()
[docs] class ListType(Enum): """Type of list element.""" UNORDERED = "ul" ORDERED = "ol" TASK = "task-list"
[docs] class ListElement(ContainerElement): """A list element.""" type: ListType start: int | None = None is_block_level: bool = True
[docs] def to_text(self, indent_level: int = 0) -> str: """Convert list to text with appropriate markers and indentation.""" if not self.children: return "" return self._format_items(self.children, indent_level)
def _format_items(self, items: list[ListItem | ListElement | Node], indent_level: int) -> str: """Recursively format list items with proper indentation.""" parts = [] item_number = self.start or 1 indent = " " * indent_level for item in items: if isinstance(item, ListItem): content_parts = [] nested_lists = [] for child in item.children: if isinstance(child, ListElement): nested_lists.append(child) else: child_text = child.to_text().strip() if child_text: content_parts.append(child_text) content = " ".join(content_parts) if item.status is not None: marker = "✓" if item.status == TaskListItemStatus.COMPLETE else "○" parts.append(f"{indent}{marker} {content}") elif self.type == ListType.UNORDERED: parts.append(f"{indent}â€ĸ {content}") elif self.type == ListType.ORDERED: parts.append(f"{indent}{item_number}. {content}") item_number += 1 else: parts.append(f"{indent}{content}") for nested_list in nested_lists: nested_text = nested_list.to_text(indent_level + 1) if nested_text: parts.append(nested_text) elif isinstance(item, ListElement): nested_text = item.to_text(indent_level + 1) if nested_text: parts.append(nested_text) else: child_text = item.to_text().strip() if child_text: parts.append(f"{indent}{child_text}") return "\n".join(parts)
[docs] class TaskListItemStatus(Enum): """Type of task list item status.""" COMPLETE = "complete" INCOMPLETE = "incomplete"
[docs] class ListItem(ContainerElement): """A list item element that can be regular or task item.""" task_id: str | None = None uuid: str | None = None status: TaskListItemStatus | None = None is_block_level: bool = True
[docs] class LinkType(Enum): """Type of link element.""" EXTERNAL = "a" MAILTO = "mailto" SPACE = "ri:space" PAGE = "ri:page" BLOG_POST = "ri:blog-post" USER = "ri:user" ATTACHMENT = "ri:attachment" ANCHOR = "ac:anchor"
[docs] class LinkElement(ContainerElement): """A link element.""" type: LinkType href: str | None = None anchor: str | None = None space_key: str | None = None content_title: str | None = None posting_day: str | None = None version_at_save: str | None = None account_id: str | None = None filename: str | None = None
[docs] def to_text(self) -> str: """Extract text from rich content or href.""" if not self.children: return self.href or "" resource_parts = [] content_parts = [] for child in self.children: child_text = child.to_text().strip() if child_text: if hasattr(child, "type") and hasattr(child.type, "value"): resource_parts.append(child_text) else: content_parts.append(child_text) if resource_parts and content_parts: resource_text = " ".join(resource_parts) content_text = " ".join(content_parts) return f"{resource_text} {content_text}" elif resource_parts: return " ".join(resource_parts) elif content_parts: return " ".join(content_parts) else: return self.href or ""
[docs] class Image(ContainerElement): """An image element.""" src: str | None = None alt: str | None = None title: str | None = None width: str | None = None height: str | None = None alignment: str | None = None layout: str | None = None original_height: str | None = None original_width: str | None = None custom_width: bool | None = None filename: str | None = None version_at_save: str | None = None url_value: str | None = None
[docs] def to_text(self) -> str: """Generate text representation with caption if present.""" image_text = f"đŸ–ŧī¸ Image: {self.alt or self.src or self.filename or self.url_value or 'Unknown'}" if self.children: caption_text = "".join(child.to_text() for child in self.children) if caption_text.strip(): return f"{image_text}\nCaption: {caption_text.strip()}" return image_text
[docs] class Table(ContainerElement): """A table element with metadata and rows.""" width: str | None = None layout: str | None = None local_id: str | None = None display_mode: str | None = None
[docs] def to_text(self) -> str: """Generate text representation of table.""" if not self.children: return "" lines = [] for row in self.children: row_text = row.to_text() if clean_row_text := row_text.strip(): lines.append(clean_row_text) return "\n".join(lines)
[docs] class TableRow(ContainerElement): """A table row.""" is_block_level: bool = True
[docs] def to_text(self) -> str: """Format row as text with | separators.""" if not self.children: return "" cell_texts = [child.to_text() for child in self.children] return " | ".join(cell_texts)
[docs] class TableCell(ContainerElement): """A table cell.""" is_header: bool = False rowspan: int | None = None colspan: int | None = None
[docs] class Emoticon(Node): """An emoticon element.""" name: str emoji_shortname: str | None = None emoji_id: str | None = None emoji_fallback: str | None = None
[docs] def to_text(self) -> str: """Return the best text representation of the emoticon.""" if self.emoji_fallback: return self.emoji_fallback elif self.emoji_shortname: return self.emoji_shortname else: return f":{self.name}:"
[docs] class Time(Node): """A time element with datetime.""" datetime: str | None = None
[docs] def to_text(self) -> str: """Generate text representation of time.""" if self.datetime: return f"📅 {self.datetime}" else: return "📅 Date"
[docs] class ResourceIdentifierType(Enum): """Type of resource identifier.""" PAGE = "page" BLOG_POST = "blog-post" ATTACHMENT = "attachment" URL = "url" SHORTCUT = "shortcut" USER = "user" SPACE = "space" CONTENT_ENTITY = "content-entity"
[docs] class ResourceIdentifier(Node): """A resource identifier element.""" type: ResourceIdentifierType space_key: str | None = None content_title: str | None = None content_id: str | None = None posting_day: str | None = None filename: str | None = None value: str | None = None key: str | None = None parameter: str | None = None account_id: str | None = None local_id: str | None = None userkey: str | None = None version_at_save: str | None = None
[docs] def to_text(self) -> str: """Generate appropriate text representation based on type.""" if self.type == ResourceIdentifierType.PAGE: return "📄 Page" elif self.type == ResourceIdentifierType.BLOG_POST: return f"📝 Blog: {self.posting_day}" if self.posting_day else "📝 Blog" elif self.type == ResourceIdentifierType.ATTACHMENT: return f"📎 Attachment: {self.filename}" if self.filename else "📎 Attachment" elif self.type == ResourceIdentifierType.URL: return f"🔗 URL: {self.value}" if self.value else "🔗 URL" elif self.type == ResourceIdentifierType.USER: if self.account_id: return f"👤 User: {self.account_id}" elif self.userkey: return f"👤 User: {self.userkey}" else: return "👤 User" elif self.type == ResourceIdentifierType.SPACE: return f"🏠 Space: {self.space_key}" if self.space_key else "🏠 Space" elif self.type == ResourceIdentifierType.SHORTCUT: return f"🔗 Shortcut: {self.key}@{self.parameter}" if self.key and self.parameter else "🔗 Shortcut" elif self.type == ResourceIdentifierType.CONTENT_ENTITY: return f"📄 Content: {self.content_id}" if self.content_id else "📄 Content"
[docs] class PlaceholderElement(Node): """A placeholder element for content hints.""" text: str
[docs] def to_text(self) -> str: """Generate text representation of placeholder.""" return f"💭 Placeholder: {self.text}"
[docs] class PanelMacroType(Enum): """Type of panel macro based on visual presentation.""" PANEL = "panel" NOTE = "note" SUCCESS = "success" WARNING = "warning" ERROR = "error" INFO = "info"
[docs] class PanelMacro(ContainerElement): """A panel macro element with background color and optional icon.""" type: PanelMacroType bg_color: str | None = None panel_icon: str | None = None panel_icon_id: str | None = None panel_icon_text: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of panel with content.""" content = super().to_text() if self.type == PanelMacroType.NOTE: return f"📝 NOTE: {content}" if content else "📝 NOTE" elif self.type == PanelMacroType.SUCCESS: return f"✅ SUCCESS: {content}" if content else "✅ SUCCESS" elif self.type == PanelMacroType.WARNING: return f"âš ī¸ WARNING: {content}" if content else "âš ī¸ WARNING" elif self.type == PanelMacroType.ERROR: return f"❌ ERROR: {content}" if content else "❌ ERROR" elif self.type == PanelMacroType.INFO: return f"â„šī¸ INFO: {content}" if content else "â„šī¸ INFO" elif self.type == PanelMacroType.PANEL: if self.panel_icon_text: return f"{self.panel_icon_text} {content}" if content else self.panel_icon_text else: return f"📋 PANEL: {content}" if content else "📋 PANEL"
[docs] class CodeMacro(Node): """A code macro element with syntax highlighting.""" language: str | None = None breakout_mode: str | None = None breakout_width: str | None = None code: str is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of code block.""" if self.language: return f"```{self.language}\n{self.code}\n```" else: return f"```\n{self.code}\n```"
[docs] class ExpandMacro(ContainerElement): """An expand/collapse macro element.""" title: str | None = None breakout_width: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of expand macro.""" content = super().to_text() title = self.title or "Expand" return f"â–ļ {title}\n{content}" if content else f"â–ļ {title}"
[docs] class StatusMacro(Node): """A status macro element with title and color.""" title: str | None = None colour: str | None = None
[docs] def to_text(self) -> str: """Generate text representation of status.""" title = self.title or "Status" if self.colour: return f"đŸˇī¸ Status: {title} ({self.colour})" else: return f"đŸˇī¸ Status: {title}"
[docs] class TocMacro(Node): """A table of contents macro element.""" style: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of table of contents.""" return "📋 Table of Contents"
[docs] class JiraMacro(Node): """A JIRA issue macro element.""" key: str | None = None server_id: str | None = None server: str | None = None
[docs] def to_text(self) -> str: """Generate text representation of JIRA issue.""" if self.key: if self.server and self.server != "System Jira": return f"đŸŽĢ {self.key} ({self.server})" else: return f"đŸŽĢ {self.key}" else: return "đŸŽĢ JIRA Issue"
[docs] class IncludeMacro(ContainerElement): """An include macro element for including other pages.""" space_key: str | None = None content_title: str | None = None version_at_save: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of include macro.""" if self.content_title: return f"📄 Include: {self.content_title}" else: return "📄 Include Page"
[docs] class TasksReportMacro(Node): """A tasks report macro element.""" spaces: str | None = None is_missing_required_parameters: bool = False is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of tasks report.""" if self.spaces: return f"📊 Tasks Report: {self.spaces}" else: return "📊 Tasks Report"
[docs] class ExcerptIncludeMacro(ContainerElement): """An excerpt include macro element.""" space_key: str | None = None content_title: str | None = None posting_day: str | None = None version_at_save: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of excerpt include.""" if self.content_title: if self.posting_day: return f"📝 Excerpt: {self.content_title} ({self.posting_day})" else: return f"📝 Excerpt: {self.content_title}" else: return "📝 Excerpt Include"
[docs] class AttachmentsMacro(Node): """An attachments macro element for listing page attachments.""" is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of attachments macro.""" return "📎 Attachments"
[docs] class ViewPdfMacro(Node): """A view PDF macro element.""" filename: str | None = None version_at_save: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of PDF viewer.""" if self.filename: return f"📄 PDF: {self.filename}" else: return "📄 PDF Viewer"
[docs] class ViewFileMacro(Node): """A view file macro element for displaying files inline.""" filename: str | None = None version_at_save: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of file viewer.""" if self.filename: return f"📁 File: {self.filename}" else: return "📁 File Viewer"
[docs] class ProfileMacro(ContainerElement): """A profile macro element for displaying user profiles.""" account_id: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of profile macro.""" if self.account_id: return f"👤 Profile: {self.account_id}" else: return "👤 User Profile"
[docs] class AnchorMacro(Node): """An anchor macro element for creating page anchors.""" anchor_name: str | None = None
[docs] def to_text(self) -> str: """Generate text representation of anchor.""" if self.anchor_name: return f"⚓ Anchor: {self.anchor_name}" else: return "⚓ Anchor"
[docs] class ExcerptMacro(ContainerElement): """An excerpt macro element for marking excerptable content.""" is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of excerpt.""" content = super().to_text() return f"📄 Excerpt: {content}" if content else "📄 Excerpt"
[docs] class DecisionListItemState(Enum): """State of decision list item.""" DECIDED = "DECIDED" PENDING = "PENDING"
[docs] class DecisionList(ContainerElement): """A decision list element containing decision items.""" local_id: str | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of decision list.""" if not self.children: return "📋 Decision List" parts = [] for child in self.children: child_text = child.to_text().strip() if child_text: parts.append(child_text) return "\n".join(parts) if parts else "📋 Decision List"
[docs] class DetailsMacro(ContainerElement): """A details macro element for collapsible content sections.""" is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of details macro.""" content = super().to_text() return f"📋 Details: {content}" if content else "📋 Details"
[docs] class DecisionListItem(ContainerElement): """A decision item element.""" local_id: str | None = None state: DecisionListItemState | None = None is_block_level: bool = True
[docs] def to_text(self) -> str: """Generate text representation of decision item.""" content = super().to_text() if self.state == DecisionListItemState.DECIDED: return f"✅ {content}" if content else "✅" else: return f"âŗ {content}" if content else "âŗ"
[docs] class Text(Node): """A node containing plain text content.""" text: str
[docs] def to_text(self) -> str: return html.unescape(self.text)