arch-design-agent-skill-das.../backend/app/modules/scanner/infrastructure/parsers/md_parser.py
2026-03-23 16:25:15 +00:00

161 lines
4.7 KiB
Python

"""Markdown parser — extracts YAML frontmatter and produces DesignDocument + specialized entities."""
from __future__ import annotations
import re
from pathlib import Path
from typing import Any
import yaml
from app.modules.design.domain.entities import (
ADR,
DesignDocument,
Domain,
ModuleBoundaryRule,
OperationalBaseline,
ReleasePlan,
RuntimeTopology,
ScopeAndGoals,
SolutionLayer,
SystemContext,
)
# Matches a YAML frontmatter block at the very start of the text: an opening
# `---` line, the YAML payload (group 1), and a closing `---` line.
# The closing delimiter may be followed by a newline OR by end-of-input, so a
# frontmatter-only file that lacks a trailing newline is still recognised.
# re.DOTALL lets `.*?` span the multiple lines of the payload.
_FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*(?:\n|\Z)", re.DOTALL)
class MdParser:
    """Parse a Markdown file and return a dict mapping entity type to instances.

    Every successfully parsed file yields a ``'design_documents'`` entry built
    from its YAML frontmatter. Depending on the file name, at most one more key
    is added: ``'scope_and_goals'``, ``'system_context'``, ``'solution_layer'``,
    ``'module_boundary_rule'``, ``'runtime_topology'``,
    ``'operational_baseline'``, ``'release_plan'``, ``'domains'`` or ``'adrs'``.
    """

    def parse(self, file_path: Path) -> dict[str, list[Any]]:
        """Parse ``file_path`` and return the entities found in it.

        Parsing is best-effort: an empty dict is returned when the file cannot
        be read, has no frontmatter block, the frontmatter is not a YAML
        mapping, or it carries no (truthy) ``doc_id``.

        Args:
            file_path: Path of the Markdown file to parse.

        Returns:
            Dict with a ``'design_documents'`` list and, when the file name
            identifies a specialized document, one additional entity list.
        """
        try:
            # utf-8-sig decodes plain UTF-8 too, but also drops a leading BOM;
            # with a BOM left in place the `^---` anchor would never match.
            content = file_path.read_text(encoding="utf-8-sig")
        except Exception:
            # Deliberate best-effort: an unreadable file yields no entities.
            return {}

        match = _FRONTMATTER_RE.match(content)
        if not match:
            return {}

        try:
            frontmatter = yaml.safe_load(match.group(1))
        except Exception:
            # Kept broad on purpose: besides yaml.YAMLError, PyYAML can raise
            # other errors (e.g. ValueError for an out-of-range timestamp).
            return {}
        if not isinstance(frontmatter, dict):
            return {}

        doc_id = frontmatter.get("doc_id", "")
        if not doc_id:
            return {}

        # A key that is present but empty (`title:`) loads as None; normalise
        # it to "" so it is treated exactly like a missing key.
        title = self._none_to_empty(frontmatter.get("title"))
        status = self._none_to_empty(frontmatter.get("status"))
        # Normalise before str() so an empty `version:` is "" and not "None".
        version = str(self._none_to_empty(frontmatter.get("version")))

        design_doc = DesignDocument(
            doc_id=doc_id,
            title=title,
            version=version,
            status=status,
            owners=self._as_list(frontmatter.get("owners")),
            upstream=self._as_list(frontmatter.get("upstream")),
            downstream=self._as_list(frontmatter.get("downstream")),
            file_path=str(file_path),
        )
        result: dict[str, list[Any]] = {"design_documents": [design_doc]}

        # Body content after frontmatter
        body = content[match.end():].strip()
        result.update(
            self._specialized_entities(file_path, doc_id, title, status, body)
        )
        return result

    @staticmethod
    def _none_to_empty(value: Any) -> Any:
        """Return ``""`` for ``None``; return any other value unchanged."""
        return "" if value is None else value

    @staticmethod
    def _as_list(value: Any) -> list[Any]:
        """Coerce a frontmatter value to a list (falsy -> [], scalar -> [scalar])."""
        if not value:
            return []
        return value if isinstance(value, list) else [value]

    def _specialized_entities(
        self,
        file_path: Path,
        doc_id: Any,
        title: Any,
        status: Any,
        body: str,
    ) -> dict[str, list[Any]]:
        """Return the single specialized entity implied by the file name, if any.

        The lower-cased file name is tested against known slugs, accepting both
        the hyphenated and the underscored spelling. The first match wins, in
        the order the checks appear below. Returns an empty dict when no rule
        matches.
        """
        fname = file_path.name.lower()

        def named(slug: str) -> bool:
            # Accept "system-context" as well as "system_context".
            return slug in fname or slug.replace("-", "_") in fname

        if named("scope-and-goals"):
            return {"scope_and_goals": [ScopeAndGoals(
                doc_id=doc_id,
                title=title,
                core_problem="",
                users="",
                constraints="",
            )]}

        # Entities constructed identically from (doc_id, title, content).
        # Built per call so the entity classes are only looked up when needed;
        # tuple order preserves the precedence between slugs.
        content_entities = (
            ("system-context", "system_context", SystemContext),
            ("solution-layering", "solution_layer", SolutionLayer),
            ("module-boundary", "module_boundary_rule", ModuleBoundaryRule),
            ("runtime-topology", "runtime_topology", RuntimeTopology),
            ("operational-baseline", "operational_baseline", OperationalBaseline),
            ("release-and-rollback", "release_plan", ReleasePlan),
        )
        for slug, key, entity_cls in content_entities:
            if named(slug):
                return {key: [entity_cls(
                    doc_id=doc_id,
                    title=title,
                    content=body,
                )]}

        if named("domain-overview"):
            # The domain name is taken from the parent directory's name.
            return {"domains": [Domain(
                domain_name=file_path.parent.name,
                overview=body,
                modules=[],
                entities=[],
            )]}

        # ADR files are named "adr-..."; files with "template" in the name
        # are skipped.
        if fname.startswith("adr-") and "template" not in fname:
            return {"adrs": [ADR(
                adr_id=doc_id,
                title=title,
                status=status,
                context=body,
                decision="",
            )]}

        return {}