Source code for speechmarkdown.parser

import re
from typing import Any, Dict, List

import pyparsing as pp

pp.ParserElement.enable_packrat()



[docs]
class ASTNode:
    """
    Node for the Abstract Syntax Tree generated by the syntax parser.

    Attributes:
        name (str): Type name of the node.
        allText (str): Entire matched text block.
        children (List[Any]): Child tokens/nodes, stored exactly as given.
    """

    def __init__(self, name: str, allText: str, children: List[Any]) -> None:
        """
        Initialize an ASTNode.

        Args:
            name (str): Type name of the node.
            allText (str): Entire matched text block.
            children (List[Any]): List of child tokens/nodes.
        """
        self.name = name
        self.allText = allText
        # The list is stored by reference, not copied.
        self.children = children

    def __repr__(self) -> str:
        """
        Return a compact debugging representation.

        The node name is printed bare, the matched text is printed with
        ``repr`` quoting, and the children use the list's own ``repr``.
        """
        return f"ASTNode({self.name}, {self.allText!r}, {self.children})"

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the AST node into a dictionary representation.

        Children that are ``ASTNode`` instances are converted recursively;
        any other child value is placed in the output list unchanged.

        Returns:
            Dict[str, Any]: Dictionary with the keys ``"name"``,
            ``"allText"`` and ``"children"``.
        """
        return {
            "name": self.name,
            "allText": self.allText,
            "children": [
                c.to_dict() if isinstance(c, ASTNode) else c for c in self.children
            ],
        }





[docs]
def ast(name: str, expr: pp.ParserElement) -> pp.ParserElement:
    """
    Build a parser element whose parse result is a single ASTNode.

    The given expression is surrounded by two zero-width position markers.
    Each marker's parse action returns the location it was invoked with, so
    the first and last tokens of the combined match are the positions used
    to slice the covered text out of the input string.

    Args:
        name (str): Name given to the resulting ASTNode and to the element.
        expr (pp.ParserElement): The PyParsing expression to wrap.

    Returns:
        pp.ParserElement: The wrapped expression, named ``name``.
    """

    def _report_location(s: str, loc: int, t: pp.ParseResults) -> int:
        # Replace the (empty) match with the location passed to the action.
        return loc

    position = pp.Empty().set_parse_action(_report_location)

    def _to_node(s: str, loc: int, toks: pp.ParseResults) -> ASTNode:
        # toks[0] / toks[-1] are the locations produced by the two markers.
        begin, finish = toks[0], toks[-1]
        # Only nested ASTNode results are kept as children; everything else
        # between the markers (e.g. raw matched strings) is discarded.
        nested = [tok for tok in toks[1:-1] if isinstance(tok, ASTNode)]
        return ASTNode(name, s[begin:finish], nested)

    wrapped = position + expr + position
    wrapped.set_parse_action(_to_node)
    wrapped.set_name(name)
    return wrapped




[docs]
def parenthesized(rule: pp.ParserElement) -> pp.ParserElement:
    """
    Wrap ``rule`` in literal parentheses with optional inner padding.

    The opening and closing parentheses are suppressed from the results.
    An optional run of spaces/tabs is accepted directly after ``(`` and
    directly before ``)``.

    Args:
        rule (pp.ParserElement): Expression expected between the parentheses.

    Returns:
        pp.ParserElement: The combined sequence.
    """
    opening = pp.Suppress("(")
    leading_pad = pp.Optional(pp.Char(" \t")[1, ...])
    trailing_pad = pp.Optional(pp.Char(" \t")[1, ...])
    closing = pp.Suppress(")")
    return opening + leading_pad + rule + trailing_pad + closing




[docs]
class SpeechMarkdownParser:
    """
    Parser for Speech Markdown syntax utilizing pyparsing.

    The grammar is built once in ``__init__`` and stored on ``self.grammar``;
    ``parse`` runs it over a complete input string and returns the root
    ``ASTNode``.
    """

    def __init__(self) -> None:
        """
        Initializes the grammar logic upon setup.
        """
        # Root parser element ("document") reused by every parse() call.
        self.grammar = self._build_grammar()

    def _build_grammar(self) -> pp.ParserElement:
        """
        Constructs and returns the root parser element for Speech Markdown.

        Every rule wrapped with ``ast(...)`` yields an ``ASTNode`` carrying
        the rule name given as the first argument.

        Returns:
            pp.ParserElement: The PyParsing document definition.
        """
        # --- Whitespace and character classes -------------------------------
        ws = pp.Char(" \t")[1, ...]
        optWs = pp.Optional(ws)

        specialCharSet = "[]()"
        specialCharSetEmphasis = "[]()*~`@#\\_!+-/"

        # NOTE(review): nonSpecialChar excludes the larger "emphasis" set while
        # nonSpecialCharEmphasis excludes only brackets/parentheses. The names
        # read as if inverted -- confirm this pairing is intended before
        # changing it.
        nonSpecialChar = ~(pp.LineEnd()) + pp.Regex(
            f'[^{"".join(re.escape(c) for c in specialCharSetEmphasis)}]'
        )
        nonSpecialCharEmphasis = ~(pp.LineEnd()) + pp.Regex(
            f'[^{"".join(re.escape(c) for c in specialCharSet)}]'
        )

        digits = pp.Word(pp.nums)
        letters = pp.Word(pp.alphas)
        integer = pp.Word(pp.nums)
        hyphen = pp.Literal("-")

        xsdToken = ast(
            "xsdToken", (digits | letters | pp.Char(specialCharSetEmphasis))[1, ...]
        )

        # --- Plain text variants --------------------------------------------
        # plainText and plainTextEmphasis share the same definition and differ
        # only in the node name they produce.
        plainText = ast("plainText", (digits | letters | ws | nonSpecialChar)[1, ...])
        plainTextEmphasis = ast(
            "plainTextEmphasis", (digits | letters | ws | nonSpecialChar)[1, ...]
        )
        plainTextChoice = digits | letters | ws | nonSpecialCharEmphasis
        plainTextModifier = ast("plainTextModifier", plainTextChoice[1, ...])

        plainTextPhone = ast(
            "plainTextPhone", parenthesized(digits) + plainTextChoice[1, ...]
        )

        plainTextSpecialChars = ast(
            "plainTextSpecialChars",
            (
                (pp.Suppress("(") + plainTextChoice + pp.Suppress(") "))
                | (pp.Suppress("[") + plainTextChoice + pp.Suppress("] "))
                | pp.Char(specialCharSetEmphasis)[1, ...]
            )[1, ...],
        )

        # Break
        timeUnit = ast("timeUnit", pp.Literal("ms") | pp.Literal("s"))
        fraction = pp.Literal(".") + pp.Word(pp.nums)[0, ...]
        number = ast("number", integer + pp.Optional(fraction))
        time = ast("time", number + timeUnit)
        shortBreak = ast("shortBreak", pp.Suppress("[") + time + pp.Suppress("]"))

        # Emphasis
        # Negative lookahead: the next input must not be a letter or digit run.
        notLetterChar = ~(letters | digits)

        shortEmphasisModerate = ast(
            "shortEmphasisModerate",
            notLetterChar
            + pp.Suppress("+")
            + plainTextEmphasis
            + pp.Suppress("+")
            + notLetterChar,
        )
        shortEmphasisStrong = ast(
            "shortEmphasisStrong",
            notLetterChar
            + pp.Suppress("++")
            + plainTextEmphasis
            + pp.Suppress("++")
            + notLetterChar,
        )
        shortEmphasisNone = ast(
            "shortEmphasisNone",
            notLetterChar
            + pp.Suppress("~")
            + plainTextEmphasis
            + pp.Suppress("~")
            + notLetterChar,
        )
        shortEmphasisReduced = ast(
            "shortEmphasisReduced",
            notLetterChar
            + pp.Suppress("-")
            + plainTextEmphasis
            + pp.Suppress("-")
            + notLetterChar,
        )

        emphasis = (
            shortEmphasisModerate
            | shortEmphasisStrong
            | shortEmphasisNone
            | shortEmphasisReduced
        )

        # Separators used inside "[key:value;key:value]" modifier lists.
        colon = pp.Suppress(":") + optWs
        semicolon = pp.Suppress(";") + optWs

        modifier_keys = [
            "emphasis",
            "address",
            "number",
            "cardinal",
            "characters",
            "chars",
            "digits",
            "drc",
            "expletive",
            "bleep",
            "fraction",
            "interjection",
            "ordinal",
            "telephone",
            "phone",
            "unit",
            "time",
            "date",
            "whisper",
            "ipa",
            "sub",
            "vol",
            "volume",
            "rate",
            "pitch",
            "timbre",
            "lang",
            "voice",
            "style",
            "role",
            "excited",
            "disappointed",
            "friendly",
            "cheerful",
            "sad",
            "angry",
            "fearful",
            "empathetic",
            "calm",
            "lyrical",
            "hopeful",
            "shouting",
            "whispering",
            "terrified",
            "unfriendly",
            "gentle",
            "serious",
            "depressed",
            "embarrassed",
            "disgruntled",
            "affectionate",
            "envious",
            "chat",
            "customerservice",
            "assistant",
            "poetry-reading",
            "narration-professional",
            "narration-relaxed",
            "newscast-casual",
            "newscast-formal",
            "newscaster",
            "documentary-narration",
            "advertisement_upbeat",
            "sports_commentary",
            "sports_commentary_excited",
        ]

        # Sort keys by length descending to match longest first (e.g. "whispering" before "whisper")
        modifier_keys.sort(key=len, reverse=True)
        textModifierKey = ast(
            "textModifierKey", pp.MatchFirst([pp.Keyword(k) for k in modifier_keys])
        )

        ipaChars = [
            ".",
            "'",
            "æ",
            "͡ʒ",
            "ð",
            "ʃ",
            "͡ʃ",
            "θ",
            "ʒ",
            "ə",
            "ɚ",
            "aɪ",
            "aʊ",
            "ɑ",
            "eɪ",
            "ɝ",
            "ɛ",
            "ɪ",
            "oʊ",
            "ɔ",
            "ɔɪ",
            "ʊ",
            "ʌ",
            "ɒ",
            "ɛə",
            "ɪə",
            "ʊə",
            "ˈ",
            "ˌ",
            "ŋ",
            "ɹ",
        ]

        # Create a matching string for Regex (escape special regex chars)
        # The result is interpolated into character classes below, so
        # multi-character entries contribute their individual characters.
        ipaCharsEscaped = "".join([re.escape(c) for c in ([".", ":", "'"] + ipaChars)])

        shortIpaValue = ast(
            "shortIpaValue", pp.Regex(f"[a-zA-Z0-9- {ipaCharsEscaped}]+")
        )
        shortIpa = ast(
            "shortIpa",
            parenthesized(plainTextModifier)
            + pp.Suppress("/")
            + shortIpaValue
            + pp.Suppress("/"),
        )

        shortSubValue = ast("shortSubValue", ~(pp.LineEnd()) + pp.Regex(r"[^}]+"))
        shortSub = ast(
            "shortSub",
            parenthesized(plainTextModifier)
            + pp.Suppress("{")
            + shortSubValue
            + pp.Suppress("}"),
        )
        bareIpa = ast("bareIpa", pp.Suppress("/") + shortIpaValue + pp.Suppress("/"))

        # Extra characters permitted in modifier values: "+", "-", digits, "%".
        percentChangeChars = r"\+\-\d\%"
        textModifierText = ast(
            "textModifierText",
            pp.Regex(f"[a-zA-Z0-9- {ipaCharsEscaped}{percentChangeChars}]+"),
        )
        textModifierTextDoubleQuote = ast(
            "textModifierTextDoubleQuote",
            pp.Regex(f"[a-zA-Z0-9- '{ipaCharsEscaped}{percentChangeChars}]+"),
        )

        singleQuotedStr = pp.Suppress("'") + textModifierText + pp.Suppress("'")
        doubleQuotedStr = (
            pp.Suppress('"') + textModifierTextDoubleQuote + pp.Suppress('"')
        )

        textModifierValue = colon + (singleQuotedStr | doubleQuotedStr)
        textModifierKeyOptionalValue = ast(
            "textModifierKeyOptionalValue",
            textModifierKey + pp.Optional(textModifierValue),
        )

        # "[key]" or "[key:'value';key2:\"value2\"]"
        modifier = (
            pp.Suppress("[")
            + pp.DelimitedList(textModifierKeyOptionalValue + optWs, delim=semicolon)
            + pp.Suppress("]")
        )

        textText = parenthesized(plainTextModifier)
        textTextPhone = parenthesized(plainTextPhone)
        # The phone form is tried first; plain parenthesized text is the fallback.
        textModifier = ast("textModifier", (textTextPhone | textText) + modifier)

        # --- Audio ------------------------------------------------------------
        urlSpecialChar = pp.Char(":/.-_~?#[]@!+,;%=()&")
        url = ast("url", (digits | letters | urlSpecialChar)[1, ...])
        audio = ast(
            "audio",
            pp.Suppress("!")
            + pp.Optional(parenthesized(pp.Optional(plainTextModifier)))
            + pp.Suppress("[")
            + (
                pp.Suppress("'") + url + pp.Suppress("'")
                | pp.Suppress('"') + url + pp.Suppress('"')
            )
            + pp.Suppress("]"),
        )

        # Section
        sectionModifierKey = ast(
            "sectionModifierKey", textModifierKey
        )  # Same keywords!
        sectionModifierText = ast(
            "sectionModifierText", (digits | letters | hyphen)[1, ...]
        )
        sectionModifierValue = colon + (
            pp.Suppress("'") + sectionModifierText + pp.Suppress("'")
            | pp.Suppress('"') + sectionModifierText + pp.Suppress('"')
        )
        sectionModifierKeyOptionalValue = ast(
            "sectionModifierKeyOptionalValue",
            sectionModifierKey + pp.Optional(sectionModifierValue),
        )
        sectionModifier = (
            pp.Suppress("[")
            + pp.DelimitedList(sectionModifierKeyOptionalValue + optWs, delim=semicolon)
            + pp.Suppress("]")
        )
        section = ast("section", pp.Suppress("#") + sectionModifier)

        # Breaks
        breakStrengthValue = ast(
            "breakStrengthValue",
            pp.Keyword("none")
            | pp.Keyword("x-weak")
            | pp.Keyword("weak")
            | pp.Keyword("medium")
            | pp.Keyword("strong")
            | pp.Keyword("x-strong"),
        )
        breakValue = ast("breakValue", breakStrengthValue | time)
        break_tag = ast(
            "break",
            pp.Suppress("[break:")
            + (
                pp.Suppress("'") + breakValue + pp.Suppress("'")
                | pp.Suppress('"') + breakValue + pp.Suppress('"')
            )
            + pp.Suppress("]"),
        )

        markTag = ast(
            "markTag",
            pp.Suppress("[mark:")
            + (
                pp.Suppress("'") + xsdToken + pp.Suppress("'")
                | pp.Suppress('"') + xsdToken + pp.Suppress('"')
            )
            + pp.Suppress("]"),
        )

        # Last-resort alternative: any single character (DOTALL, so "." also
        # matches a newline character).
        any_char = pp.Regex(r".", flags=re.DOTALL)

        # Alternatives are tried in the order listed; the first match wins.
        inline = ~(pp.LineEnd()) + (
            bareIpa
            | shortIpa
            | shortSub
            | textModifier
            | emphasis
            | shortBreak
            | break_tag
            | audio
            | markTag
            | plainTextSpecialChars
            | plainText
            | any_char
        )

        # --- Line / document structure ----------------------------------------
        emptyLine = ast("emptyLine", pp.Regex(r"[ \t]*") + pp.LineEnd())
        lineEnd = pp.LineEnd() | pp.StringEnd()
        restOfLine = inline[0, ...] + lineEnd
        # A simple line must not be an empty line and must not start at the
        # end of the input.
        simpleLine = ast("simpleLine", ~emptyLine + ~pp.StringEnd() + restOfLine)
        paragraph = ast("paragraph", simpleLine[1, ...])

        content = section | paragraph | emptyLine
        document = ast("document", content[0, ...])

        return document

    def parse(self, text: str) -> ASTNode:
        """
        Parse raw Speech Markdown text to an AST.

        The whole input must be consumed by the grammar (``parse_all=True``).

        Args:
            text (str): Speech Markdown string.

        Returns:
            ASTNode: The root of the generated syntax tree (the first token
            produced by the ``document`` rule).

        Raises:
            ValueError: If pyparsing raises ``ParseException`` for the input.
                The original exception is attached as ``__cause__``.
        """
        # Local import keeps this method self-contained.
        from typing import cast

        # Only the pyparsing call is guarded, so unrelated errors are never
        # mistaken for parse failures.
        try:
            results = self.grammar.parse_string(text, parse_all=True)
        except pp.ParseException as e:
            # Chain explicitly so the pyparsing diagnostics remain reachable.
            raise ValueError(f"Parse error: {e}") from e

        # The root rule is wrapped by ast("document", ...), whose parse action
        # returns an ASTNode; cast only informs static type checkers.
        return cast(ASTNode, results[0])