Examples

This section provides comprehensive examples of using the Confluence Content Parser for common tasks.

Basic Parsing and Text Extraction

This example demonstrates the fundamental parsing capabilities and text extraction:

Basic Usage Example
#!/usr/bin/env python3
"""
Basic usage example for Confluence Content Parser.

This example demonstrates core parsing capabilities including:
- Text formatting (bold, italic, code)
- Links and references
- Status macros
- Tables
- Lists and task lists
- Details macros with placeholders
"""

from confluence_content_parser import ConfluenceParser


def main():
    """Parse an inline sample page and print what the parser extracted from it.

    The walkthrough prints, in order: the document's plain text, the text of
    each element type located with ``find_all``, a multi-type ``find_all``
    cross-checked against the single-type results, node/element counts, and
    any diagnostics stored under ``document.metadata["diagnostics"]``.
    """
    confluence_content = """
    <h1>Project Documentation</h1>

    <p>This document demonstrates <strong>basic parsing</strong> of Confluence content with <em>formatting</em> and <code>inline code</code>.</p>

    <h2>Project Status</h2>
    <p>Current status: <ac:structured-macro ac:name="status">
        <ac:parameter ac:name="title">In Progress</ac:parameter>
        <ac:parameter ac:name="colour">Yellow</ac:parameter>
    </ac:structured-macro></p>

    <h2>Team Information</h2>
    <ac:structured-macro ac:name="details">
        <ac:rich-text-body>
            <table>
                <tr><th>Role</th><th>Assignee</th></tr>
                <tr><td>Project Lead</td><td><ac:placeholder>@ mention lead</ac:placeholder></td></tr>
                <tr><td>Developer</td><td><ac:placeholder>@ mention developer</ac:placeholder></td></tr>
                <tr><td>QA Engineer</td><td><ac:placeholder>@ mention qa</ac:placeholder></td></tr>
            </table>
        </ac:rich-text-body>
    </ac:structured-macro>

    <h2>Tasks</h2>
    <ac:task-list>
        <ac:task>
            <ac:task-id>1</ac:task-id>
            <ac:task-status>complete</ac:task-status>
            <ac:task-body>Set up project repository</ac:task-body>
        </ac:task>
        <ac:task>
            <ac:task-id>2</ac:task-id>
            <ac:task-status>incomplete</ac:task-status>
            <ac:task-body>Implement core features</ac:task-body>
        </ac:task>
    </ac:task-list>

    <h2>External Resources</h2>
    <ul>
        <li>Documentation: <ac:link><ri:url ri:value="https://docs.example.com"/></ac:link></li>
        <li>Repository: <ac:link><ri:url ri:value="https://github.com/example/project"/></ac:link></li>
    </ul>
    """

    # Parse the content
    parser = ConfluenceParser()
    document = parser.parse(confluence_content)

    print("=== BASIC CONFLUENCE PARSING EXAMPLE ===\n")

    # Get clean text output
    print("1. DOCUMENT TEXT:")
    print(document.text)
    print("\n" + "=" * 50 + "\n")

    # Extract specific elements using find_all
    # (element classes are imported next to the section that uses them so
    # each numbered section of the walkthrough reads on its own)
    print("2. HEADINGS:")
    from confluence_content_parser import HeadingElement

    headings = document.find_all(HeadingElement)
    for heading in headings:
        # The last character of heading.type.value is printed as the level;
        # presumably the values look like "h1".."h6" - confirm against the enum
        print(f"  H{heading.type.value[-1]}: {heading.to_text()}")
    print()

    print("3. STATUS ELEMENTS:")
    from confluence_content_parser import StatusMacro

    status_elements = document.find_all(StatusMacro)
    for status in status_elements:
        print(f"  {status.to_text()}")
    print()

    print("4. TABLES:")
    from confluence_content_parser import Table

    tables = document.find_all(Table)
    for i, table in enumerate(tables, 1):
        # The number of direct children of the table is reported as its row count
        print(f"  Table {i}: {len(table.children)} rows")
        print(f"    Content: {table.to_text()}")
    print()

    print("5. LINKS:")
    from confluence_content_parser import LinkElement

    links = document.find_all(LinkElement)
    for link in links:
        print(f"  {link.to_text()}")
    print()

    print("6. TASK LISTS:")
    from confluence_content_parser import ListElement, ListType

    # find_all returns every ListElement; keep only those typed as task lists
    lists = document.find_all(ListElement)
    task_lists = [list_element for list_element in lists if list_element.type == ListType.TASK]
    for task_list in task_lists:
        print(f"  Tasks: {task_list.to_text()}")
    print()

    print("7. PLACEHOLDER ELEMENTS:")
    from confluence_content_parser import PlaceholderElement

    placeholders = document.find_all(PlaceholderElement)
    for placeholder in placeholders:
        print(f"  {placeholder.to_text()}")
    print()

    print("8. DETAILS MACROS:")
    from confluence_content_parser import DetailsMacro

    details = document.find_all(DetailsMacro)
    for detail in details:
        print(f"  {detail.to_text()}")
    print()

    print("9. MULTIPLE TYPE SEARCH:")
    # Find multiple element types at once
    # (the result is unpacked positionally: one collection per requested type)
    headings_multi, status_multi, placeholders_multi = document.find_all(
        HeadingElement, StatusMacro, PlaceholderElement
    )
    print(f"  Found in one search: {len(headings_multi)} headings, {len(status_multi)} status elements, {len(placeholders_multi)} placeholders")

    # Compare with individual searches (should match)
    assert len(headings_multi) == len(headings)
    assert len(status_multi) == len(status_elements)
    assert len(placeholders_multi) == len(placeholders)
    print("  ✓ Results match individual searches")
    print()

    # Document statistics
    # NOTE(review): len() is applied directly to the result of walk(), so this
    # relies on walk() returning a sized collection rather than a generator -
    # confirm against the library
    all_nodes = document.walk()
    print("10. DOCUMENT STATISTICS:")
    print(f"  Total nodes: {len(all_nodes)}")
    print(f"  Headings: {len(headings)}")
    print(f"  Tables: {len(tables)}")
    print(f"  Links: {len(links)}")
    print(f"  Status elements: {len(status_elements)}")
    print(f"  Placeholders: {len(placeholders)}")
    print(f"  Details macros: {len(details)}")

    # Check for any parsing issues
    # (a missing "diagnostics" key is treated the same as an empty list)
    diagnostics = document.metadata.get("diagnostics", [])
    if diagnostics:
        print(f"\n11. PARSING DIAGNOSTICS: {diagnostics}")
    else:
        print("\n11. PARSING: No issues detected ✓")


# Run the walkthrough only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Output Analysis

When you run the basic example, you’ll see:

  1. Document Text: Clean, formatted text with proper spacing

  2. Element Extraction: Specific element types found and processed

  3. Statistics: Overview of document structure

  4. Diagnostics: Any parsing issues encountered

Advanced Content Processing

Working with Complex Layouts

from confluence_content_parser import ConfluenceParser
from confluence_content_parser import LayoutElement, LayoutSection, LayoutCell

# Storage-format markup for one layout holding a single two-cell section
layout_content = """
<ac:layout>
    <ac:layout-section ac:type="two_equal">
        <ac:layout-cell>
            <h2>Left Column</h2>
            <p>Content for the left side.</p>
        </ac:layout-cell>
        <ac:layout-cell>
            <h2>Right Column</h2>
            <p>Content for the right side.</p>
        </ac:layout-cell>
    </ac:layout-section>
</ac:layout>
"""

parser = ConfluenceParser()
document = parser.parse(layout_content)

# Descend layout -> section -> cell, reporting each section's type and
# how many cells were found inside it
for layout in document.find_all(LayoutElement):
    for section in layout.find_all(LayoutSection):
        print(f"Section type: {section.section_type.value}")
        cell_count = len(section.find_all(LayoutCell))
        print(f"Number of cells: {cell_count}")

Processing Macros and Special Content

from confluence_content_parser import ConfluenceParser
from confluence_content_parser import PanelMacro, CodeMacro, ExpandMacro

macro_content = """
<ac:structured-macro ac:name="info">
    <ac:rich-text-body>
        <p>This is important information.</p>
    </ac:rich-text-body>
</ac:structured-macro>

<ac:structured-macro ac:name="code">
    <ac:parameter ac:name="language">python</ac:parameter>
    <ac:plain-text-body>
def hello_world():
    print("Hello, World!")
    </ac:plain-text-body>
</ac:structured-macro>

<ac:structured-macro ac:name="expand">
    <ac:parameter ac:name="title">Click to expand</ac:parameter>
    <ac:rich-text-body>
        <p>Hidden content here.</p>
    </ac:rich-text-body>
</ac:structured-macro>
"""

parser = ConfluenceParser()
document = parser.parse(macro_content)

# Panel macros: report the panel variant followed by its text
for panel in document.find_all(PanelMacro):
    panel_kind = panel.type.value
    print(f"Panel type: {panel_kind}")
    print(f"Content: {panel.to_text()}")

# Code macros: report the declared language and the source text
for snippet in document.find_all(CodeMacro):
    print(f"Language: {snippet.language}")
    print(f"Code: {snippet.code}")

# Expand macros: report the title and the body text
for expander in document.find_all(ExpandMacro):
    print(f"Title: {expander.title}")
    print(f"Content: {expander.to_text()}")

Table Processing

from confluence_content_parser import ConfluenceParser
from confluence_content_parser import Table, TableRow, TableCell

table_content = """
<table>
    <tr>
        <th>Name</th>
        <th>Role</th>
        <th>Department</th>
    </tr>
    <tr>
        <td>John Doe</td>
        <td>Developer</td>
        <td>Engineering</td>
    </tr>
    <tr>
        <td>Jane Smith</td>
        <td>Designer</td>
        <td>UX</td>
    </tr>
</table>
"""

parser = ConfluenceParser()
document = parser.parse(table_content)

# Print every row of every table as "Row N: a | b | c", with header
# cells wrapped in ** so they stand out
for table in document.find_all(Table):
    for row_number, row in enumerate(table.find_all(TableRow), start=1):
        rendered_cells = []
        for cell in row.find_all(TableCell):
            text = cell.to_text()
            rendered_cells.append(f"**{text}**" if cell.is_header else text)

        print(f"Row {row_number}: {' | '.join(rendered_cells)}")

Task List Processing

from confluence_content_parser import ConfluenceParser
from confluence_content_parser import ListElement, ListItem, ListType, TaskListItemStatus

task_content = """
<ac:task-list>
    <ac:task>
        <ac:task-id>1</ac:task-id>
        <ac:task-status>complete</ac:task-status>
        <ac:task-body>Complete project setup</ac:task-body>
    </ac:task>
    <ac:task>
        <ac:task-id>2</ac:task-id>
        <ac:task-status>incomplete</ac:task-status>
        <ac:task-body>Write documentation</ac:task-body>
    </ac:task>
</ac:task-list>
"""

parser = ConfluenceParser()
document = parser.parse(task_content)

# Only lists typed as task lists are of interest here
task_lists = [
    candidate
    for candidate in document.find_all(ListElement)
    if candidate.type == ListType.TASK
]

for task_list in task_lists:
    print("Task List:")

    for item in task_list.find_all(ListItem):
        # A check mark for completed tasks, an open circle for everything else
        if item.status == TaskListItemStatus.COMPLETE:
            marker = "✓"
        else:
            marker = "○"
        print(f"  {marker} {item.to_text()}")

Error Handling and Diagnostics

Diagnostics and Error Handling
#!/usr/bin/env python3
"""
Diagnostics usage example for Confluence Content Parser.

This example demonstrates:
- How to handle parsing errors and diagnostics
- Unknown elements and macros
- Unicode and encoding issues
- Best practices for robust parsing
- Troubleshooting common issues
"""

from confluence_content_parser import ConfluenceParser


def main():
    """Parse deliberately awkward content and report what the parser made of it.

    The walkthrough parses a sample containing an unrecognised macro and an
    unrecognised element with ``raise_on_finish=False``, then prints: the
    collected diagnostics, per-type element counts, a link breakdown, the
    placeholders, the first lines of the plain text, the outcome of re-parsing
    with ``raise_on_finish=True``, and summary statistics.
    """
    problematic_content = """
    <h1>Diagnostics Example</h1>

    <p>This content contains various elements that will generate diagnostics:</p>

    <!-- Unknown macro that doesn't exist -->
    <ac:structured-macro ac:name="unknown-macro" ac:schema-version="1">
        <ac:parameter ac:name="param1">value1</ac:parameter>
        <ac:rich-text-body>
            <p>This macro is not implemented</p>
        </ac:rich-text-body>
    </ac:structured-macro>

    <!-- Known elements -->
    <ac:structured-macro ac:name="details" ac:schema-version="1">
        <ac:rich-text-body>
            <table>
                <tr><th>Field</th><th>Value</th></tr>
                <tr><td>Status</td><td><ac:structured-macro ac:name="status">
                    <ac:parameter ac:name="title">Active</ac:parameter>
                    <ac:parameter ac:name="colour">Green</ac:parameter>
                </ac:structured-macro></td></tr>
            </table>
        </ac:rich-text-body>
    </ac:structured-macro>

    <!-- Links to various resources -->
    <h2>Links Example</h2>
    <ul>
        <li>External URL: <ac:link><ri:url ri:value="https://example.com"/></ac:link></li>
        <li>Page link: <ac:link><ri:page ri:space-key="DOC" ri:content-title="User Guide"/></ac:link></li>
        <li>User mention: <ac:link><ri:user ri:account-id="user123"/></ac:link></li>
        <li>Attachment: <ac:link><ri:attachment ri:filename="document.pdf"/></ac:link></li>
    </ul>

    <!-- Task list -->
    <h2>Tasks</h2>
    <ac:task-list>
        <ac:task>
            <ac:task-id>task1</ac:task-id>
            <ac:task-status>complete</ac:task-status>
            <ac:task-body>Review documentation</ac:task-body>
        </ac:task>
        <ac:task>
            <ac:task-id>task2</ac:task-id>
            <ac:task-status>incomplete</ac:task-status>
            <ac:task-body><ac:placeholder>Add task description here</ac:placeholder></ac:task-body>
        </ac:task>
    </ac:task-list>

    <!-- Unknown elements (these will be skipped) -->
    <unknown-element>This should be skipped</unknown-element>

    <!-- Inline comment marker (should be skipped) -->
    <p>Some text with <ac:inline-comment-marker ac:ref="comment-123">inline comment</ac:inline-comment-marker></p>

    <!-- Panel with content -->
    <ac:structured-macro ac:name="panel" ac:schema-version="1">
        <ac:parameter ac:name="title">Important Note</ac:parameter>
        <ac:parameter ac:name="bgColor">#FFF2CC</ac:parameter>
        <ac:rich-text-body>
            <p>This is a panel with <strong>formatted content</strong> and placeholders:</p>
            <p><ac:placeholder>Add important information here</ac:placeholder></p>
        </ac:rich-text-body>
    </ac:structured-macro>
    """

    print("=== DIAGNOSTICS EXAMPLE ===\n")

    # Parse with diagnostics enabled (default)
    print("1. PARSING WITH DIAGNOSTICS:")
    parser = ConfluenceParser(raise_on_finish=False)  # Don't raise errors, collect diagnostics
    doc = parser.parse(problematic_content)

    print(f"   Document parsed successfully: {doc.root is not None}")
    print(f"   Total elements found: {len(doc.walk())}")
    print()

    # Check diagnostics
    print("2. PARSING DIAGNOSTICS:")
    diagnostics = doc.metadata.get("diagnostics", [])
    if diagnostics:
        print(f"   Found {len(diagnostics)} diagnostic messages:")
        for i, diag in enumerate(diagnostics, 1):
            print(f"     {i}. {diag}")
    else:
        print("   No diagnostic messages (all elements parsed successfully)")
    print()

    # Analyze what was successfully parsed
    print("3. SUCCESSFULLY PARSED ELEMENTS:")

    # Count different types of elements
    from confluence_content_parser import (
        DetailsMacro,
        HeadingElement,
        LinkElement,
        ListElement,
        PanelMacro,
        PlaceholderElement,
        StatusMacro,
        Table,
    )

    element_counts = {
        "Headings": len(doc.find_all(HeadingElement)),
        "Status macros": len(doc.find_all(StatusMacro)),
        "Details macros": len(doc.find_all(DetailsMacro)),
        "Placeholders": len(doc.find_all(PlaceholderElement)),
        "Links": len(doc.find_all(LinkElement)),
        # Task lists are recognised by the string value of the list type
        "Task lists": len(
            [
                list_element
                for list_element in doc.find_all(ListElement)
                if hasattr(list_element.type, "value") and list_element.type.value == "task-list"
            ]
        ),
        "Panels": len(doc.find_all(PanelMacro)),
        "Tables": len(doc.find_all(Table)),
    }

    for element_type, count in element_counts.items():
        print(f"   {element_type}: {count}")
    print()

    # Link analysis with type breakdown
    print("4. LINK ANALYSIS:")
    links = doc.find_all(LinkElement)
    if links:
        link_types = {}
        for link in links:
            # Fall back to str() when the link type is not an enum-like object
            link_type = link.type.value if hasattr(link.type, "value") else str(link.type)
            link_types[link_type] = link_types.get(link_type, 0) + 1

        for link_type, count in link_types.items():
            print(f"   {link_type} links: {count}")

        print("\n   Link details:")
        for i, link in enumerate(links, 1):
            link_text = link.to_text().strip()
            link_type = link.type.value if hasattr(link.type, "value") else str(link.type)
            print(f"     {i}. {link_type}: {link_text}")
    else:
        print("   No links found")
    print()

    # Placeholder analysis
    print("5. PLACEHOLDER ANALYSIS:")
    placeholders = doc.find_all(PlaceholderElement)
    if placeholders:
        print(f"   Found {len(placeholders)} placeholders:")
        for i, placeholder in enumerate(placeholders, 1):
            print(f"     {i}. {placeholder.to_text()}")
    else:
        print("   No placeholders found")
    print()

    # Document text extraction
    print("6. CLEAN TEXT OUTPUT:")
    print("   " + "=" * 47)
    clean_text = doc.text
    # Split once; the same line list feeds the preview below and the
    # non-empty line count in the statistics section
    all_lines = clean_text.split("\n")
    # Show first few lines of clean text
    for line in all_lines[:10]:
        if line.strip():
            print(f"   {line.strip()}")
    if len(all_lines) > 10:
        print("   ... (truncated)")
    print("   " + "=" * 47)
    print()

    # Error handling example
    print("7. ERROR HANDLING EXAMPLE:")
    try:
        # Try parsing with raise_on_finish=True
        strict_parser = ConfluenceParser(raise_on_finish=True)
        strict_parser.parse(problematic_content)
        print("   Strict parsing succeeded (no unknown elements)")
    except Exception as e:
        # Broad on purpose: the demo reports whatever the strict parser raises
        print(f"   Strict parsing failed as expected: {type(e).__name__}")
        print(f"   Error details: {str(e)}")
    print()

    # Parsing statistics
    print("8. PARSING STATISTICS:")
    total_elements = len(doc.walk())
    successful_elements = total_elements
    failed_elements = len(diagnostics)

    if total_elements > 0:
        success_rate = ((successful_elements) / (successful_elements + failed_elements)) * 100
        print(f"   Total parsed elements: {successful_elements}")
        print(f"   Failed/unknown elements: {failed_elements}")
        print(f"   Success rate: {success_rate:.1f}%")

    # The count is computed outside the f-string: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12, and the
    # escaped form '\\n' would split on a literal backslash followed by "n"
    # instead of on newlines.
    non_empty_lines = [line for line in all_lines if line.strip()]
    print(f"   Document length: {len(clean_text)} characters")
    print(f"   Non-empty lines: {len(non_empty_lines)}")


# Run the walkthrough only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Custom Content Analysis

Document Statistics

from confluence_content_parser import ConfluenceParser
from confluence_content_parser import (
    HeadingElement, LinkElement, Image, Table,
    ListElement, PanelMacro, CodeMacro
)

def analyze_document(content):
    """Parse *content* and summarise it as a dictionary of counts.

    The result maps ``total_nodes``, one count per element family
    (``headings``, ``links``, ``images``, ``tables``, ``lists``, ``panels``,
    ``code_blocks``), ``text_length`` and the raw ``diagnostics`` list taken
    from the document metadata (empty list when absent).
    """
    document = ConfluenceParser().parse(content)

    # Two multi-type lookups cover all seven element families
    heading_nodes, link_nodes, image_nodes = document.find_all(HeadingElement, LinkElement, Image)
    table_nodes, list_nodes, panel_nodes, code_nodes = document.find_all(
        Table, ListElement, PanelMacro, CodeMacro
    )

    stats = {'total_nodes': sum(1 for _ in document.walk())}

    # Labels are paired with their node collections so the counts are
    # inserted in a fixed, readable order
    element_groups = (
        ('headings', heading_nodes),
        ('links', link_nodes),
        ('images', image_nodes),
        ('tables', table_nodes),
        ('lists', list_nodes),
        ('panels', panel_nodes),
        ('code_blocks', code_nodes),
    )
    for label, nodes in element_groups:
        stats[label] = len(nodes)

    stats['text_length'] = len(document.text)
    stats['diagnostics'] = document.metadata.get('diagnostics', [])
    return stats

# Usage
content = """<h1>Sample</h1><p>Text with <strong>formatting</strong></p>"""
stats = analyze_document(content)

print("Document Analysis:")
for key, value in stats.items():
    # Diagnostics are listed separately below, one per line
    if key == 'diagnostics':
        continue
    label = key.replace('_', ' ').title()
    print(f"  {label}: {value}")

issues = stats['diagnostics']
if issues:
    print("  Parsing Issues:")
    for issue in issues:
        print(f"    - {issue}")

Content Search and Filtering

from confluence_content_parser import ConfluenceParser
from confluence_content_parser import HeadingElement, Text, LinkElement

def search_content(document, search_term):
    """Return every node whose text contains *search_term*, ignoring case.

    Args:
        document: Object exposing ``walk()``, which yields nodes that each
            provide ``to_text()``.
        search_term: Text to look for; compared case-insensitively.

    Returns:
        A list of dicts, one per matching node in walk order, with keys
        ``type`` (the node's class name), ``content`` (the node text, cut to
        100 characters plus ``'...'`` when longer) and ``node`` (the node).
    """
    # Lower-case the term once instead of once per node
    needle = search_term.lower()
    matching_nodes = []

    for node in document.walk():
        # Render the node a single time; the text is reused for the match,
        # the length check and the preview
        text = node.to_text()
        if needle not in text.lower():
            continue

        if len(text) > 100:
            preview = text[:100] + '...'
        else:
            preview = text

        matching_nodes.append({
            'type': type(node).__name__,
            'content': preview,
            'node': node,
        })

    return matching_nodes

def find_external_links(document):
    """Collect the external links of *document* that carry a non-empty href.

    Each entry is a dict with ``url`` (the link's href) plus ``text`` and
    ``context``, both of which hold the link's rendered text.
    """
    from confluence_content_parser import LinkType

    return [
        {
            'url': link.href,
            'text': link.to_text(),
            'context': link.to_text(),
        }
        for link in document.find_all(LinkElement)
        if link.type == LinkType.EXTERNAL and link.href
    ]

# Usage example
# Sample input so the snippet runs on its own; substitute the storage-format
# body of your own page
confluence_content = """
<h1>API Overview</h1>
<p>The API reference is published at <a href="https://docs.example.com/api">docs.example.com</a>.</p>
"""

parser = ConfluenceParser()
document = parser.parse(confluence_content)

# Search for specific content
api_references = search_content(document, 'API')
print(f"Found {len(api_references)} API references")

# Find external links
external_links = find_external_links(document)
for link in external_links:
    print(f"External link: {link['url']} ({link['text']})")

Content Transformation

from confluence_content_parser import ConfluenceParser
from confluence_content_parser import HeadingElement, CodeMacro, PanelMacro

def convert_to_markdown(document):
    """Render the headings, code macros and panel macros of *document* as Markdown.

    Nodes are visited in ``document.walk()`` order; any other node type is
    ignored. The emitted pieces are joined with blank lines.
    """
    blocks = []

    for node in document.walk():
        if isinstance(node, HeadingElement):
            # The second character of the type value is the level ("h2" -> 2)
            depth = int(node.type.value[1])
            blocks.append(f"{'#' * depth} {node.to_text()}")

        elif isinstance(node, CodeMacro):
            # Opening fence (with language when set), the code, closing fence
            opening_fence = f"```{node.language or ''}"
            blocks.extend([opening_fence, node.code, "```"])

        elif isinstance(node, PanelMacro):
            label = node.type.value.upper()
            body = ' '.join(child.to_text() for child in node.children)
            blocks.append(f"> **{label}**: {body}")

    return '\n\n'.join(blocks)

# Usage
# Sample input so the snippet runs on its own; substitute the storage-format
# body of your own page
confluence_content = """
<h1>Release Notes</h1>
<h2>Highlights</h2>
<ac:structured-macro ac:name="info">
    <ac:rich-text-body><p>Upgrade before the end of the quarter.</p></ac:rich-text-body>
</ac:structured-macro>
"""

parser = ConfluenceParser()
document = parser.parse(confluence_content)
markdown = convert_to_markdown(document)
print(markdown)

Performance Optimization

Processing Large Documents

from confluence_content_parser import ConfluenceParser
from confluence_content_parser import HeadingElement

def process_large_document(xml_content, chunk_size=1000):
    """Build an indented heading outline for a document.

    The whole document is parsed first; the outline is then assembled from
    its headings, followed by one summary line when any panels or code
    blocks are present.

    Args:
        xml_content: Storage-format markup to parse.
        chunk_size: Accepted for backward compatibility; this function does
            not currently use it.

    Returns:
        A list of strings: one ``"- <text>"`` entry per heading, indented two
        spaces per level below 1, optionally followed by a
        ``"Content: N panels, M code blocks"`` line.
    """
    # Imported here because this snippet's top-level imports only bring in
    # ConfluenceParser and HeadingElement
    from confluence_content_parser import CodeMacro, PanelMacro

    parser = ConfluenceParser()
    document = parser.parse(xml_content)

    # One multi-type lookup fetches everything the outline needs
    headings, panels, codes = document.find_all(HeadingElement, PanelMacro, CodeMacro)
    outline = []

    for heading in headings:
        # The second character of the type value is the level ("h2" -> 2)
        level = int(heading.type.value[1])
        text = heading.to_text()
        outline.append(f"{'  ' * (level - 1)}- {text}")

    # Add summary of other content
    if panels or codes:
        outline.append(f"Content: {len(panels)} panels, {len(codes)} code blocks")

    return outline

def extract_text_efficiently(document, max_chunk_size=1000):
    """Group the non-blank text of every walked node into size-bounded chunks.

    Node texts are accumulated in walk order; as soon as the accumulated
    character count reaches ``max_chunk_size`` the texts gathered so far are
    joined with single spaces and emitted as one chunk. A chunk can therefore
    exceed ``max_chunk_size`` by the length of the text that closed it.

    Args:
        document: An already-parsed document exposing ``walk()``, which
            yields nodes that each provide ``to_text()``.
        max_chunk_size: Character threshold at which the current chunk is
            closed. Defaults to 1000.

    Returns:
        A list of chunk strings; empty when no node has non-blank text.
    """
    text_chunks = []
    current_chunk = []
    current_size = 0

    for node in document.walk():
        text = node.to_text()
        # Whitespace-only text contributes nothing to a chunk
        if not text.strip():
            continue

        current_chunk.append(text)
        current_size += len(text)

        if current_size >= max_chunk_size:
            text_chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_size = 0

    # Flush whatever is left below the threshold
    if current_chunk:
        text_chunks.append(' '.join(current_chunk))

    return text_chunks

Integration Examples

Document Validation

from confluence_content_parser import ConfluenceParser, ParsingError

def validate_confluence_document(xml_content):
    """Parse *xml_content* leniently and describe the outcome as a dict.

    On success the result has ``valid=True``, the document's diagnostics
    under ``warnings`` and node/text counts under ``statistics``. If parsing
    or inspecting the document raises, the result has ``valid=False``, the
    exception message under ``error`` and empty ``warnings``/``statistics``.
    """
    parser = ConfluenceParser(raise_on_finish=False)

    try:
        document = parser.parse(xml_content)

        # Gathered inside the try block so a failure while inspecting the
        # document is reported the same way as a parse failure
        diagnostics = document.metadata.get('diagnostics', [])
        node_count = sum(1 for _ in document.walk())
        text_length = len(document.text)
    except Exception as exc:
        return dict(valid=False, error=str(exc), warnings=[], statistics={})

    return dict(
        valid=True,
        warnings=diagnostics,
        statistics=dict(total_nodes=node_count, text_length=text_length),
    )

# Usage
# Sample input so the snippet runs on its own; substitute the storage-format
# body of your own page
xml_content = "<h1>Release Notes</h1><p>Everything shipped on time.</p>"

validation = validate_confluence_document(xml_content)
if validation['valid']:
    print("Document is valid")
    if validation['warnings']:
        print(f"Warnings: {validation['warnings']}")
else:
    print(f"Validation failed: {validation['error']}")

Batch Processing

from confluence_content_parser import ConfluenceParser
import concurrent.futures
import os

def process_single_file(file_path):
    """Read one UTF-8 file, parse it leniently and summarise the result.

    Returns a dict that always has ``file`` and ``success``. On success it
    also has ``text_length`` and ``diagnostics``; on any failure (reading or
    parsing) it has ``error`` holding the exception message instead.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            markup = handle.read()

        document = ConfluenceParser(raise_on_finish=False).parse(markup)
        text_length = len(document.text)
        diagnostics = document.metadata.get('diagnostics', [])
    except Exception as exc:
        # Failures are reported in the result rather than raised, so one bad
        # file does not abort a batch
        return dict(file=file_path, success=False, error=str(exc))

    return dict(
        file=file_path,
        success=True,
        text_length=text_length,
        diagnostics=diagnostics,
    )

def batch_process_files(file_paths, max_workers=4):
    """Run ``process_single_file`` over *file_paths* on a thread pool.

    Results are collected as the individual jobs finish, so their order
    follows completion rather than the order of *file_paths*.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(process_single_file, path) for path in file_paths]
        results = [
            finished.result()
            for finished in concurrent.futures.as_completed(pending)
        ]

    return results

# Usage
xml_files = ['doc1.xml', 'doc2.xml', 'doc3.xml']
results = batch_process_files(xml_files)

for result in results:
    # Report failures first, then fall through to the success message
    if not result['success']:
        print(f"Failed {result['file']}: {result['error']}")
        continue
    print(f"Processed {result['file']}: {result['text_length']} chars")

Testing Patterns

Unit Testing with Parser

import unittest
from confluence_content_parser import ConfluenceParser
from confluence_content_parser import HeadingElement, PanelMacro

class TestConfluenceParser(unittest.TestCase):
    """Example unit tests that drive ConfluenceParser through its public API."""

    def setUp(self):
        # A fresh, default-configured parser for every test
        self.parser = ConfluenceParser()

    def test_heading_parsing(self):
        """Two consecutive headings come back in order with their text."""
        parsed = self.parser.parse("<h1>Main Title</h1><h2>Subtitle</h2>")

        found = parsed.find_all(HeadingElement)
        self.assertEqual(len(found), 2)
        self.assertEqual(found[0].to_text(), "Main Title")
        self.assertEqual(found[1].to_text(), "Subtitle")

    def test_multiple_type_search(self):
        """A single find_all call with two types yields one result group per type."""
        markup = '''
        <h1>Title</h1>
        <ac:structured-macro ac:name="info">
            <ac:rich-text-body><p>Info content</p></ac:rich-text-body>
        </ac:structured-macro>
        '''
        parsed = self.parser.parse(markup)

        heading_nodes, panel_nodes = parsed.find_all(HeadingElement, PanelMacro)
        self.assertEqual(len(heading_nodes), 1)
        self.assertEqual(len(panel_nodes), 1)
        self.assertEqual(heading_nodes[0].to_text(), "Title")
        self.assertIn("Info content", panel_nodes[0].to_text())

    def test_panel_macro(self):
        """An info macro is found as a PanelMacro and keeps its body text."""
        markup = '''
        <ac:structured-macro ac:name="info">
            <ac:rich-text-body>
                <p>Important information</p>
            </ac:rich-text-body>
        </ac:structured-macro>
        '''
        parsed = self.parser.parse(markup)

        panel_nodes = parsed.find_all(PanelMacro)
        self.assertEqual(len(panel_nodes), 1)
        self.assertIn("Important information", panel_nodes[0].to_text())

    def test_error_handling(self):
        """Malformed input parsed with raise_on_finish=False exposes a diagnostics list."""
        # A lenient parser is built here instead of using the setUp one;
        # parse() must return without raising for the rest to run
        lenient_parser = ConfluenceParser(raise_on_finish=False)
        parsed = lenient_parser.parse("<h1>Unclosed heading")

        reported = parsed.metadata.get('diagnostics', [])
        self.assertIsInstance(reported, list)


if __name__ == '__main__':
    unittest.main()

See Also