Source code for speechmarkdown.formatters.text

import re
from typing import List, Optional, Union

from speechmarkdown.formatters.base import FormatterBase
from speechmarkdown.options import SpeechOptions
from speechmarkdown.parser import ASTNode



[docs]
class TextFormatter(FormatterBase):
    """
    Formatter for outputting plain text instead of SSML elements.
    """

    def __init__(self, options: SpeechOptions) -> None:
        """
        Initialize the TextFormatter.

        This formatter sets up no state of its own; the options object is
        handed unchanged to the parent-class initializer.

        Args:
            options (SpeechOptions): Setup options for formatting output.
        """
        # NOTE(review): formatFromAst reads ``self.options``; presumably the
        # parent initializer stores it there — confirm against FormatterBase.
        super().__init__(options)


[docs]
    def format(self, ast: Union[ASTNode, List[ASTNode]]) -> str:
        """
        Render an AST, or a list of AST nodes, as one plain-text string.

        The text fragments produced for the input are concatenated, leading
        and trailing whitespace is stripped, and every run of two or more
        space characters is squeezed down to a single space. Only literal
        spaces are collapsed; tabs and newlines inside the text are kept.

        Args:
            ast (Union[ASTNode, List[ASTNode]]): A single root node, or a
                list of nodes to render in order.

        Returns:
            str: The plain-text rendering, with no SSML markup.
        """
        chunks: List[str] = []

        # A list is delegated to addArray; anything else is treated as a
        # single node.
        if not isinstance(ast, list):
            self.formatFromAst(ast, chunks)
        else:
            self.addArray(ast, chunks)

        # Strip first, then collapse runs of spaces in what remains.
        return re.sub(r"  +", " ", "".join(chunks).strip())



[docs]
    def formatFromAst(
        self, ast: ASTNode, lines: Optional[List[str]] = None
    ) -> List[str]:
        """
        Append the plain-text rendering of one AST node to a list of chunks.

        The node's ``name`` selects the handling:

        * ``lineEnd`` and the ``plainText*`` kinds append ``allText`` as is.
        * ``emptyLine`` appends ``allText`` unless the options object has a
          falsy ``preserveEmptyLines`` (a missing attribute counts as true).
        * ``shortIpa`` / ``shortSub`` append the text of the first child
          named ``parenthesized`` (with its first and last character
          removed) or ``plainTextModifier``.
        * ``bareIpa`` appends the text of its first ``shortIpaValue`` child.
        * ``audio`` contributes nothing.
        * Everything else, including ``document``, ``paragraph`` and
          ``simpleLine``, is rendered by processing its children.

        Args:
            ast (ASTNode): The node to render. An object without a ``name``
                attribute is ignored.
            lines (Optional[List[str]]): List to append to; a new list is
                created when omitted.

        Returns:
            List[str]: The list that received the chunks (``lines`` itself
            when it was supplied).
        """
        collected: List[str] = [] if lines is None else lines

        if not hasattr(ast, "name"):
            return collected

        kind = ast.name

        # Audio has no textual representation.
        if kind == "audio":
            return collected

        # Nodes whose raw text is emitted verbatim.
        verbatim_kinds = (
            "lineEnd",
            "plainText",
            "plainTextSpecialChars",
            "plainTextEmphasis",
            "plainTextPhone",
            "plainTextModifier",
        )
        if kind in verbatim_kinds:
            collected.append(ast.allText)
            return collected

        if kind == "emptyLine":
            if getattr(self.options, "preserveEmptyLines", True):
                collected.append(ast.allText)
            return collected

        if kind in ("shortIpa", "shortSub"):
            # Only the first matching child carries the visible text.
            chosen = None
            for child in ast.children:
                if child.name in ("parenthesized", "plainTextModifier"):
                    chosen = child
                    break

            if chosen and chosen.name == "parenthesized":
                spoken = self.extractParenthesizedText(chosen)
            else:
                spoken = getattr(chosen, "allText", "")

            if spoken:
                collected.append(spoken)
            return collected

        if kind == "bareIpa":
            value_node = None
            for child in ast.children:
                if child.name == "shortIpaValue":
                    value_node = child
                    break

            symbols = getattr(value_node, "allText", "")
            if symbols:
                collected.append(symbols)
            return collected

        # Containers and any unrecognised node: render the children.
        self.processAst(ast.children, collected)
        return collected



[docs]
    def extractParenthesizedText(self, node: ASTNode) -> str:
        """
        Return a node's text with its first and last character removed.

        The outer characters are dropped by position; they are not checked
        to be parentheses. The remaining text is stripped of surrounding
        whitespace.

        Args:
            node (ASTNode): Node whose ``allText`` holds the wrapped text.

        Returns:
            str: The inner text, or an empty string when the node is falsy,
            has no (or empty) ``allText``, or its text is shorter than two
            characters.
        """
        raw = getattr(node, "allText", None) if node else None
        if not raw or len(raw) < 2:
            return ""
        inner = raw[1:-1]
        return inner.strip()