import re
from typing import Any, Dict, List
import pyparsing as pp
# Enable pyparsing's packrat memoization globally (class-level switch on
# ParserElement) before any grammar in this module is built or used.
pp.ParserElement.enable_packrat()
class ASTNode:
    """
    Node for the Abstract Syntax Tree generated by the syntax parser.

    Attributes:
        name (str): Type name of the node.
        allText (str): Entire matched text block.
        children (List[Any]): List of child tokens/nodes.
    """

    def __init__(self, name: str, allText: str, children: List[Any]) -> None:
        """
        Initialize an ASTNode.

        Args:
            name (str): Type name of the node.
            allText (str): Entire matched text block.
            children (List[Any]): List of child tokens/nodes.
        """
        # The node type must be stored: both __repr__ and to_dict read it.
        self.name = name
        self.allText = allText
        self.children = children

    def __repr__(self) -> str:
        """Return a debug representation showing name, text and children."""
        return f"ASTNode({self.name}, {self.allText!r}, {self.children})"

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the AST node into a dictionary representation.

        Child ASTNode instances are converted recursively; any other child
        value is copied into the result unchanged.

        Returns:
            Dict[str, Any]: Basic dictionary tree.
        """
        return {
            "name": self.name,
            "allText": self.allText,
            "children": [
                c.to_dict() if isinstance(c, ASTNode) else c for c in self.children
            ],
        }
def ast(name: str, expr: pp.ParserElement) -> pp.ParserElement:
    """
    Wrap a parser element so that a successful match yields one ASTNode.

    The expression is bracketed by two empty marker elements whose parse
    actions emit the input location; the first and last tokens of the
    combined match therefore delimit the matched text.

    Args:
        name (str): Name given to the produced ASTNode and to the wrapped element.
        expr (pp.ParserElement): The PyParsing expression to wrap.

    Returns:
        pp.ParserElement: The wrapped sequence.
    """

    def report_location(s: str, loc: int, toks: pp.ParseResults) -> int:
        # The marker's only job is to put the location into the token stream.
        return loc

    marker = pp.Empty().set_parse_action(report_location)
    wrapped = marker + expr + marker

    def build_node(s: str, loc: int, toks: pp.ParseResults) -> ASTNode:
        # First/last tokens are the locations emitted by the two markers.
        span_start, span_end = toks[0], toks[-1]
        nested: List[Any] = []
        for tok in toks[1:-1]:
            # Only already-built AST nodes are kept; raw tokens are dropped.
            if isinstance(tok, ASTNode):
                nested.append(tok)
        return ASTNode(name, s[span_start:span_end], nested)

    wrapped.set_parse_action(build_node)
    return wrapped.set_name(name)
def parenthesized(rule: pp.ParserElement) -> pp.ParserElement:
    """
    Build an expression matching ``rule`` enclosed in round brackets.

    Both brackets are suppressed from the results, and an optional run of
    space/tab characters is accepted on each side of ``rule``.

    Args:
        rule (pp.ParserElement): Expression expected between the brackets.

    Returns:
        pp.ParserElement: The bracketed expression.
    """
    opening = pp.Suppress("(")
    padding_before = pp.Optional(pp.Char(" \t")[1, ...])
    padding_after = pp.Optional(pp.Char(" \t")[1, ...])
    closing = pp.Suppress(")")
    return opening + padding_before + rule + padding_after + closing
class SpeechMarkdownParser:
    """
    Parser for Speech Markdown syntax utilizing pyparsing.

    The grammar is built once when the instance is created and stored in
    ``self.grammar``; ``parse`` reuses it for every call.
    """

    def __init__(self) -> None:
        """
        Initializes the grammar logic upon setup.
        """
        # Root parser element (the "document" rule) used by parse().
        self.grammar = self._build_grammar()

    def _build_grammar(self) -> pp.ParserElement:
        """
        Constructs and returns the root parser element for Speech Markdown.

        Rules wrapped with ``ast(...)`` produce ASTNode objects named after
        the rule. Alternatives joined with ``|`` are tried in the order they
        are written, so the ordering of the statements and alternatives
        below is significant.

        Returns:
            pp.ParserElement: The PyParsing document definition.
        """
        # --- Character-level building blocks --------------------------------
        ws = pp.Char(" \t")[1, ...]
        optWs = pp.Optional(ws)
        specialCharSet = "[]()"
        specialCharSetEmphasis = "[]()*~`@#\\_!+-/"
        # NOTE(review): nonSpecialChar excludes the *emphasis* character set
        # while nonSpecialCharEmphasis excludes only the bracket set; the
        # names look swapped relative to the sets they use - confirm intent
        # before changing.
        nonSpecialChar = ~(pp.LineEnd()) + pp.Regex(
            f'[^{"".join(re.escape(c) for c in specialCharSetEmphasis)}]'
        )
        nonSpecialCharEmphasis = ~(pp.LineEnd()) + pp.Regex(
            f'[^{"".join(re.escape(c) for c in specialCharSet)}]'
        )
        digits = pp.Word(pp.nums)
        letters = pp.Word(pp.alphas)
        integer = pp.Word(pp.nums)
        hyphen = pp.Literal("-")
        xsdToken = ast(
            "xsdToken", (digits | letters | pp.Char(specialCharSetEmphasis))[1, ...]
        )

        # --- Plain text variants ---------------------------------------------
        # plainText and plainTextEmphasis use the same expression; they differ
        # only in the name of the node they produce.
        plainText = ast("plainText", (digits | letters | ws | nonSpecialChar)[1, ...])
        plainTextEmphasis = ast(
            "plainTextEmphasis", (digits | letters | ws | nonSpecialChar)[1, ...]
        )
        plainTextChoice = digits | letters | ws | nonSpecialCharEmphasis
        plainTextModifier = ast("plainTextModifier", plainTextChoice[1, ...])
        plainTextPhone = ast(
            "plainTextPhone", parenthesized(digits) + plainTextChoice[1, ...]
        )
        plainTextSpecialChars = ast(
            "plainTextSpecialChars",
            (
                (pp.Suppress("(") + plainTextChoice + pp.Suppress(") "))
                | (pp.Suppress("[") + plainTextChoice + pp.Suppress("] "))
                | pp.Char(specialCharSetEmphasis)[1, ...]
            )[1, ...],
        )

        # Break
        timeUnit = ast("timeUnit", pp.Literal("ms") | pp.Literal("s"))
        fraction = pp.Literal(".") + pp.Word(pp.nums)[0, ...]
        number = ast("number", integer + pp.Optional(fraction))
        time = ast("time", number + timeUnit)
        shortBreak = ast("shortBreak", pp.Suppress("[") + time + pp.Suppress("]"))

        # Emphasis
        # Negative lookahead: the next input must not start with letters/digits.
        notLetterChar = ~(letters | digits)
        shortEmphasisModerate = ast(
            "shortEmphasisModerate",
            notLetterChar
            + pp.Suppress("+")
            + plainTextEmphasis
            + pp.Suppress("+")
            + notLetterChar,
        )
        shortEmphasisStrong = ast(
            "shortEmphasisStrong",
            notLetterChar
            + pp.Suppress("++")
            + plainTextEmphasis
            + pp.Suppress("++")
            + notLetterChar,
        )
        shortEmphasisNone = ast(
            "shortEmphasisNone",
            notLetterChar
            + pp.Suppress("~")
            + plainTextEmphasis
            + pp.Suppress("~")
            + notLetterChar,
        )
        shortEmphasisReduced = ast(
            "shortEmphasisReduced",
            notLetterChar
            + pp.Suppress("-")
            + plainTextEmphasis
            + pp.Suppress("-")
            + notLetterChar,
        )
        emphasis = (
            shortEmphasisModerate
            | shortEmphasisStrong
            | shortEmphasisNone
            | shortEmphasisReduced
        )

        # --- Text modifiers: (text)[key:"value";key2] -----------------------
        colon = pp.Suppress(":") + optWs
        semicolon = pp.Suppress(";") + optWs
        modifier_keys = [
            "emphasis",
            "address",
            "number",
            "cardinal",
            "characters",
            "chars",
            "digits",
            "drc",
            "expletive",
            "bleep",
            "fraction",
            "interjection",
            "ordinal",
            "telephone",
            "phone",
            "unit",
            "time",
            "date",
            "whisper",
            "ipa",
            "sub",
            "vol",
            "volume",
            "rate",
            "pitch",
            "timbre",
            "lang",
            "voice",
            "style",
            "role",
            "excited",
            "disappointed",
            "friendly",
            "cheerful",
            "sad",
            "angry",
            "fearful",
            "empathetic",
            "calm",
            "lyrical",
            "hopeful",
            "shouting",
            "whispering",
            "terrified",
            "unfriendly",
            "gentle",
            "serious",
            "depressed",
            "embarrassed",
            "disgruntled",
            "affectionate",
            "envious",
            "chat",
            "customerservice",
            "assistant",
            "poetry-reading",
            "narration-professional",
            "narration-relaxed",
            "newscast-casual",
            "newscast-formal",
            "newscaster",
            "documentary-narration",
            "advertisement_upbeat",
            "sports_commentary",
            "sports_commentary_excited",
        ]
        # Sort keys by length descending to match longest first (e.g. "whispering" before "whisper")
        modifier_keys.sort(key=len, reverse=True)
        textModifierKey = ast(
            "textModifierKey", pp.MatchFirst([pp.Keyword(k) for k in modifier_keys])
        )
        ipaChars = [
            ".",
            "'",
            "æ",
            "͡ʒ",
            "ð",
            "ʃ",
            "͡ʃ",
            "θ",
            "ʒ",
            "ə",
            "ɚ",
            "aɪ",
            "aʊ",
            "ɑ",
            "eɪ",
            "ɝ",
            "ɛ",
            "ɪ",
            "oʊ",
            "ɔ",
            "ɔɪ",
            "ʊ",
            "ʌ",
            "ɒ",
            "ɛə",
            "ɪə",
            "ʊə",
            "ˈ",
            "ˌ",
            "ŋ",
            "ɹ",
        ]
        # Create a matching string for Regex (escape special regex chars)
        ipaCharsEscaped = "".join([re.escape(c) for c in ([".", ":", "'"] + ipaChars)])
        shortIpaValue = ast(
            "shortIpaValue", pp.Regex(f"[a-zA-Z0-9- {ipaCharsEscaped}]+")
        )
        shortIpa = ast(
            "shortIpa",
            parenthesized(plainTextModifier)
            + pp.Suppress("/")
            + shortIpaValue
            + pp.Suppress("/"),
        )
        shortSubValue = ast("shortSubValue", ~(pp.LineEnd()) + pp.Regex(r"[^}]+"))
        shortSub = ast(
            "shortSub",
            parenthesized(plainTextModifier)
            + pp.Suppress("{")
            + shortSubValue
            + pp.Suppress("}"),
        )
        bareIpa = ast("bareIpa", pp.Suppress("/") + shortIpaValue + pp.Suppress("/"))
        # Extra characters accepted in modifier values: "+", "-", digits, "%".
        percentChangeChars = r"\+\-\d\%"
        textModifierText = ast(
            "textModifierText",
            pp.Regex(f"[a-zA-Z0-9- {ipaCharsEscaped}{percentChangeChars}]+"),
        )
        textModifierTextDoubleQuote = ast(
            "textModifierTextDoubleQuote",
            pp.Regex(f"[a-zA-Z0-9- '{ipaCharsEscaped}{percentChangeChars}]+"),
        )
        singleQuotedStr = pp.Suppress("'") + textModifierText + pp.Suppress("'")
        doubleQuotedStr = (
            pp.Suppress('"') + textModifierTextDoubleQuote + pp.Suppress('"')
        )
        textModifierValue = colon + (singleQuotedStr | doubleQuotedStr)
        textModifierKeyOptionalValue = ast(
            "textModifierKeyOptionalValue",
            textModifierKey + pp.Optional(textModifierValue),
        )
        modifier = (
            pp.Suppress("[")
            + pp.DelimitedList(textModifierKeyOptionalValue + optWs, delim=semicolon)
            + pp.Suppress("]")
        )
        textText = parenthesized(plainTextModifier)
        textTextPhone = parenthesized(plainTextPhone)
        textModifier = ast("textModifier", (textTextPhone | textText) + modifier)

        # --- Audio: !(caption)["url"] ---------------------------------------
        urlSpecialChar = pp.Char(":/.-_~?#[]@!+,;%=()&")
        url = ast("url", (digits | letters | urlSpecialChar)[1, ...])
        audio = ast(
            "audio",
            pp.Suppress("!")
            + pp.Optional(parenthesized(pp.Optional(plainTextModifier)))
            + pp.Suppress("[")
            + (
                pp.Suppress("'") + url + pp.Suppress("'")
                | pp.Suppress('"') + url + pp.Suppress('"')
            )
            + pp.Suppress("]"),
        )

        # Section
        sectionModifierKey = ast(
            "sectionModifierKey", textModifierKey
        )  # Same keywords!
        sectionModifierText = ast(
            "sectionModifierText", (digits | letters | hyphen)[1, ...]
        )
        sectionModifierValue = colon + (
            pp.Suppress("'") + sectionModifierText + pp.Suppress("'")
            | pp.Suppress('"') + sectionModifierText + pp.Suppress('"')
        )
        sectionModifierKeyOptionalValue = ast(
            "sectionModifierKeyOptionalValue",
            sectionModifierKey + pp.Optional(sectionModifierValue),
        )
        sectionModifier = (
            pp.Suppress("[")
            + pp.DelimitedList(sectionModifierKeyOptionalValue + optWs, delim=semicolon)
            + pp.Suppress("]")
        )
        section = ast("section", pp.Suppress("#") + sectionModifier)

        # Breaks
        breakStrengthValue = ast(
            "breakStrengthValue",
            pp.Keyword("none")
            | pp.Keyword("x-weak")
            | pp.Keyword("weak")
            | pp.Keyword("medium")
            | pp.Keyword("strong")
            | pp.Keyword("x-strong"),
        )
        breakValue = ast("breakValue", breakStrengthValue | time)
        break_tag = ast(
            "break",
            pp.Suppress("[break:")
            + (
                pp.Suppress("'") + breakValue + pp.Suppress("'")
                | pp.Suppress('"') + breakValue + pp.Suppress('"')
            )
            + pp.Suppress("]"),
        )
        markTag = ast(
            "markTag",
            pp.Suppress("[mark:")
            + (
                pp.Suppress("'") + xsdToken + pp.Suppress("'")
                | pp.Suppress('"') + xsdToken + pp.Suppress('"')
            )
            + pp.Suppress("]"),
        )

        # --- Line / paragraph / document structure --------------------------
        # Last-resort alternative: a single character of any kind.
        any_char = pp.Regex(r".", flags=re.DOTALL)
        # Structured constructs are listed before the plain-text fallbacks so
        # they get the first chance to match.
        inline = ~(pp.LineEnd()) + (
            bareIpa
            | shortIpa
            | shortSub
            | textModifier
            | emphasis
            | shortBreak
            | break_tag
            | audio
            | markTag
            | plainTextSpecialChars
            | plainText
            | any_char
        )
        emptyLine = ast("emptyLine", pp.Regex(r"[ \t]*") + pp.LineEnd())
        lineEnd = pp.LineEnd() | pp.StringEnd()
        restOfLine = inline[0, ...] + lineEnd
        # A simple line must be neither an empty line nor the end of input.
        simpleLine = ast("simpleLine", ~emptyLine + ~pp.StringEnd() + restOfLine)
        paragraph = ast("paragraph", simpleLine[1, ...])
        content = section | paragraph | emptyLine
        document = ast("document", content[0, ...])
        return document

    def parse(self, text: str) -> ASTNode:
        """
        Parse raw Speech Markdown text to an AST.

        Args:
            text (str): Speech Markdown string.

        Returns:
            ASTNode: The root of the generated syntax tree (the "document" node).

        Raises:
            ValueError: If pyparsing raises a ParseException for ``text``. The
                original exception is attached as ``__cause__``.
        """
        from typing import cast

        # Keep the try body limited to the call that can raise ParseException.
        try:
            results = self.grammar.parse_string(text, parse_all=True)
        except pp.ParseException as exc:
            # Chain the cause so the pyparsing location details stay visible.
            raise ValueError(f"Parse error: {exc}") from exc
        # The root rule is wrapped by ast(), so the first token is the document node.
        return cast(ASTNode, results[0])