# Source code for hugiml.governance

# Copyright 2026 Srikumar Krishnamoorthy
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Governance artifacts for HUGIMLClassifierNative.

Provides model card generation, audit artifact packaging, and governance
metadata consistent with responsible model deployment practices and the
HUG-IML paper's emphasis on interpretability.
"""

from __future__ import annotations

import hashlib
import json
import logging
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Any

logger = logging.getLogger(__name__)

__all__ = [
    "ModelCard",
    "AuditArtifact",
    "GovernanceMetadata",
    "generate_model_card",
    "package_audit_artifacts",
]


# =============================================================================
# Model card
# =============================================================================



@dataclass
class ModelCard:
    """Structured model card for a fitted HUGIMLClassifierNative.

    Follows the Google Model Cards framework adapted for rule-based
    interpretable classifiers.

    Attributes
    ----------
    model_id : str
        Unique identifier for this model version.
    model_type : str
        Always 'HUGIMLClassifierNative'.
    paper_reference : str
        Citation for the HUG-IML algorithm.
    license : str
        Software license.
    intended_use : str
        Describe the intended classification task.
    out_of_scope_use : str
        Describe uses not covered by this model.
    training_data_description : str
        Description of training data.
    evaluation_data_description : str
        Description of evaluation data.
    hyperparameters : dict
        B, L, G, topK as used during training.
    performance_metrics : dict
        Accuracy, F1, AUC, ECE, Brier score, etc.
    n_patterns : int
        Number of mined HUG patterns.
    n_compound : int
        Number of compound patterns.
    top_patterns : list of str
        Most important patterns.
    limitations : list of str
        Known limitations.
    ethical_considerations : str
        Fairness, bias, and ethical notes.
    created_at : str
        UTC timestamp of creation, formatted ``YYYY-MM-DDTHH:MM:SSZ``.
        Defaults to the moment the instance is constructed.
    framework_version : str
        hugiml-core version.
    """

    model_id: str
    model_type: str = "HUGIMLClassifierNative"
    paper_reference: str = (
        "Krishnamoorthy, S. (2024). Interpretable Classifier Models for "
        "Decision Support Using High Utility Gain Patterns. "
        "IEEE Access, 12, 126088-126107. DOI: 10.1109/ACCESS.2024.3455563"
    )
    license: str = "Apache-2.0"
    intended_use: str = ""
    out_of_scope_use: str = ""
    training_data_description: str = ""
    evaluation_data_description: str = ""
    hyperparameters: dict[str, Any] = field(default_factory=dict)
    performance_metrics: dict[str, Any] = field(default_factory=dict)
    n_patterns: int = 0
    n_compound: int = 0
    top_patterns: list[str] = field(default_factory=list)
    limitations: list[str] = field(default_factory=list)
    ethical_considerations: str = ""
    created_at: str = field(
        default_factory=lambda: time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    )
    framework_version: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dictionary.

        ``dataclasses.asdict`` copies nested dicts and lists, so the result
        can be modified without affecting this card.
        """
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialize to a JSON string.

        Values that ``json`` cannot encode natively are converted with
        ``str`` (``default=str``) instead of raising.
        """
        return json.dumps(self.to_dict(), indent=indent, default=str)

    def to_markdown(self) -> str:
        """Render the model card as a Markdown document.

        Every section is always emitted.  A section with nothing to show,
        whether an empty text field or an empty collection, contains the
        placeholder ``_Not specified._``.  At most the first 15 entries of
        ``top_patterns`` are listed.
        """
        placeholder = "_Not specified._"

        def bullets(items: list[str]) -> list[str]:
            # Keep empty list sections visibly marked, like the text sections.
            return items or [placeholder]

        lines = [
            f"# Model Card: {self.model_id}",
            "",
            # Two trailing spaces force a Markdown line break.
            f"**Type:** {self.model_type}  ",
            f"**License:** {self.license}  ",
            f"**Created:** {self.created_at}  ",
            f"**Framework:** hugiml-core {self.framework_version}",
            "",
            "## Reference",
            "",
            self.paper_reference,
            "",
            "## Intended Use",
            "",
            self.intended_use or placeholder,
            "",
            "## Out-of-Scope Use",
            "",
            self.out_of_scope_use or placeholder,
            "",
            "## Training Data",
            "",
            self.training_data_description or placeholder,
            "",
            "## Evaluation Data",
            "",
            self.evaluation_data_description or placeholder,
            "",
            "## Hyperparameters",
            "",
        ]
        lines += bullets([f"- **{k}**: {v}" for k, v in self.hyperparameters.items()])
        lines += [
            "",
            "## Performance Metrics",
            "",
        ]
        lines += bullets([f"- **{k}**: {v}" for k, v in self.performance_metrics.items()])
        lines += [
            "",
            "## Patterns",
            "",
            f"- Total patterns: {self.n_patterns}",
            f"- Compound patterns: {self.n_compound}",
            "",
            "### Top Patterns",
            "",
        ]
        lines += bullets([f"- `{p}`" for p in self.top_patterns[:15]])
        lines += [
            "",
            "## Limitations",
            "",
        ]
        lines += bullets([f"- {lim}" for lim in self.limitations])
        lines += [
            "",
            "## Ethical Considerations",
            "",
            self.ethical_considerations or placeholder,
        ]
        return "\n".join(lines)

    def save(self, path: str, fmt: str = "json") -> None:
        """Save the model card to a file.

        Parameters
        ----------
        path : str
            Output file path.  The file is written as UTF-8 and overwritten
            if it already exists.
        fmt : {'json', 'markdown', 'md'}
            Output format, matched case-insensitively.  Any other value is
            written as JSON and a warning is logged.
        """
        kind = fmt.lower() if isinstance(fmt, str) else fmt
        if kind in ("markdown", "md"):
            content = self.to_markdown()
        else:
            if kind != "json":
                # Still fall back to JSON, but make the mismatch visible.
                logger.warning("Unrecognised model card format %r; writing JSON.", fmt)
            content = self.to_json()
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(content)





@dataclass
class GovernanceMetadata:
    """Minimal governance metadata attached to a model instance.

    Attributes
    ----------
    model_id : str
        Identifier of the model this metadata describes.
    owner : str
        Person or team responsible for this model.
    purpose : str
        Business or scientific purpose.
    data_classification : str
        Sensitivity of training data (e.g. 'public', 'internal', 'confidential').
        Defaults to 'unclassified'.
    review_status : str
        One of 'draft', 'reviewed', 'approved', 'deprecated'.  The value is
        stored as given; this class does not validate it.
    approved_by : str or None
        Approver, if any.
    approved_at : str or None
        Approval time, if any.
    tags : list of str
        Free-form labels.
    """

    model_id: str
    owner: str = ""
    purpose: str = ""
    data_classification: str = "unclassified"
    review_status: str = "draft"
    approved_by: str | None = None
    approved_at: str | None = None
    tags: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return governance metadata as a plain dictionary.

        ``dataclasses.asdict`` copies the ``tags`` list, so the result can
        be modified without affecting this instance.
        """
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialise governance metadata to a JSON string.

        Values that ``json`` cannot encode natively are converted with
        ``str`` (``default=str``) instead of raising.
        """
        return json.dumps(self.to_dict(), indent=indent, default=str)




# =============================================================================
# Audit artifact
# =============================================================================



@dataclass
class AuditArtifact:
    """Audit record for a model training run.

    Captures all information needed for regulatory review or internal audit.

    Attributes
    ----------
    model_id : str
        Identifier of the audited model.
    created_at : str
        UTC timestamp formatted ``YYYY-MM-DDTHH:MM:SSZ``.  Defaults to the
        moment the instance is constructed.
    training_hash : str
        Provenance hash for the training run; empty by default.
    model_card : dict or None
        Model card contents as a plain dictionary.
    governance : dict or None
        Governance metadata as a plain dictionary.
    fit_metadata : dict or None
        Fit-time metadata of the classifier.
    pattern_info : list of dict or None
        One record per pattern.
    calibration : dict or None
        Calibration results, if supplied.
    explainability : dict or None
        Explainability report, if supplied.
    framework_version : str
        hugiml-core version.
    """

    model_id: str
    created_at: str = field(
        default_factory=lambda: time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    )
    training_hash: str = ""
    model_card: dict[str, Any] | None = None
    governance: dict[str, Any] | None = None
    fit_metadata: dict[str, Any] | None = None
    pattern_info: list[dict[str, Any]] | None = None
    calibration: dict[str, Any] | None = None
    explainability: dict[str, Any] | None = None
    framework_version: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return audit artifact fields as a plain dictionary.

        ``dataclasses.asdict`` copies nested dicts and lists, so the result
        can be modified without affecting this artifact.
        """
        return asdict(self)

    def to_json(self, indent: int = 2) -> str:
        """Serialise the audit artifact to a JSON string.

        Values that ``json`` cannot encode natively are converted with
        ``str`` (``default=str``) instead of raising.
        """
        return json.dumps(self.to_dict(), indent=indent, default=str)

    def save(self, path: str) -> None:
        """Write the audit artifact to a JSON file.

        The file is written as UTF-8 and overwritten if it already exists.
        """
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(self.to_json())




# =============================================================================
# Factory functions
# =============================================================================



def generate_model_card(
    classifier: Any,
    model_id: str,
    *,
    intended_use: str = "",
    out_of_scope_use: str = "",
    training_data_description: str = "",
    evaluation_data_description: str = "",
    performance_metrics: dict[str, Any] | None = None,
    limitations: list[str] | None = None,
    ethical_considerations: str = "",
) -> ModelCard:
    """Populate a ModelCard from a fitted classifier.

    Hyperparameters and pattern counts are read from the classifier's
    ``fit_metadata_`` attribute when it exists.  Top patterns come from
    ``feature_importances()`` and, if that fails, from
    ``get_hug_features()``; if both fail the list stays empty.  Neither
    failure is raised to the caller.

    Parameters
    ----------
    classifier : HUGIMLClassifierNative
        A fitted classifier.
    model_id : str
        Unique identifier.
    intended_use, out_of_scope_use : str
        Copied to the card unchanged.
    training_data_description, evaluation_data_description : str
        Copied to the card unchanged.
    performance_metrics : dict, optional
        Copied to the card; ``None`` becomes an empty dict.
    limitations : list of str, optional
        Known limitations.  ``None`` or an empty list selects a built-in
        default list.
    ethical_considerations : str
        Copied to the card unchanged.

    Returns
    -------
    ModelCard
    """
    from hugiml import __version__

    top_n = 15  # number of leading patterns recorded on the card

    meta = getattr(classifier, "fit_metadata_", None)
    hparams: dict[str, Any] = {}
    n_patterns = 0
    n_compound = 0
    top_patterns: list[str] = []

    if meta is not None:
        config = getattr(meta, "config", None)
        if isinstance(config, dict):
            # Copy so that editing the card cannot alter the fit metadata.
            hparams = dict(config)
        elif config is not None:
            hparams = config
        # A count stored as None must not reach the card's int fields.
        n_patterns = getattr(meta, "n_patterns", 0) or 0
        n_compound = getattr(meta, "n_compound", 0) or 0

    try:
        feat_imp = classifier.feature_importances().head(top_n)
        top_patterns = feat_imp["pattern"].tolist()
    except Exception:
        logger.debug(
            "feature_importances() unavailable for model card; trying get_hug_features().",
            exc_info=True,
        )
        try:
            # list() guarantees a plain list whatever sequence type is returned.
            top_patterns = list(classifier.get_hug_features()[:top_n])
        except Exception:
            logger.debug("get_hug_features() also unavailable for model card.", exc_info=True)

    default_limitations = [
        "Patterns are derived from training data; distribution shift may reduce accuracy.",
        "Auto-binning may not be optimal for all numerical feature distributions.",
        "Compound pattern mining (L > 1) increases interpretability but may reduce coverage.",
        "Downstream logistic regression may underfit when the pattern matrix is sparse.",
    ]

    return ModelCard(
        model_id=model_id,
        intended_use=intended_use,
        out_of_scope_use=out_of_scope_use,
        training_data_description=training_data_description,
        evaluation_data_description=evaluation_data_description,
        hyperparameters=hparams,
        performance_metrics=performance_metrics or {},
        n_patterns=n_patterns,
        n_compound=n_compound,
        top_patterns=top_patterns,
        limitations=limitations or default_limitations,
        ethical_considerations=ethical_considerations,
        framework_version=__version__,
    )




def package_audit_artifacts(
    classifier: Any,
    model_id: str,
    output_dir: str,
    *,
    model_card: ModelCard | None = None,
    governance: GovernanceMetadata | None = None,
    calibration_result: Any | None = None,
    explainability_report: Any | None = None,
) -> str:
    """Package all audit artifacts for a trained model.

    Creates ``output_dir`` if needed and writes three files into it:
    ``model_card.json``, ``model_card.md`` and ``audit_manifest.json``.
    The manifest embeds the model card, governance metadata, fit metadata,
    pattern info, a provenance hash and the optional calibration and
    explainability reports.

    Collecting pattern info, the provenance hash, calibration and
    explainability is best-effort: a failure is logged as a warning and
    the corresponding manifest entry is left empty, ``None`` or
    ``'unavailable'`` rather than aborting the packaging.

    Parameters
    ----------
    classifier : HUGIMLClassifierNative
        A fitted classifier.
    model_id : str
        Unique identifier recorded in the manifest.
    output_dir : str
        Directory that receives the files.
    model_card : ModelCard, optional
        Card to package; generated from ``classifier`` when omitted.
    governance : GovernanceMetadata, optional
        Governance metadata; when omitted the manifest records only the
        model id.
    calibration_result : object, optional
        Object exposing ``to_dict()``.
    explainability_report : object, optional
        Object exposing ``to_json()`` that returns a JSON document.

    Returns
    -------
    str
        Path to the audit manifest JSON file.
    """
    from hugiml import __version__

    os.makedirs(output_dir, exist_ok=True)

    # Model card
    if model_card is None:
        model_card = generate_model_card(classifier, model_id)
    model_card.save(os.path.join(output_dir, "model_card.json"))
    model_card.save(os.path.join(output_dir, "model_card.md"), fmt="md")

    # Governance
    gov_dict = governance.to_dict() if governance is not None else {"model_id": model_id}

    # Fit metadata: copy only the attributes that exist and are not None.
    fit_attrs = (
        "n_samples",
        "n_features",
        "n_classes",
        "n_items",
        "n_patterns",
        "n_compound",
        "topK_used",
        "stage_times_ms",
        "total_fit_ms",
        "matrix_density",
        "config",
        "memory_peak_mb",
        "memory_rss_mb",
        "openmp_threads",
        "degraded",
    )
    meta = getattr(classifier, "fit_metadata_", None)
    meta_dict: dict[str, Any] = {}
    if meta is not None:
        for attr in fit_attrs:
            val = getattr(meta, attr, None)
            if val is not None:
                meta_dict[attr] = val

    # Pattern info
    pattern_info_list: list[dict[str, Any]] = []
    try:
        df = classifier.get_pattern_info()
        pattern_info_list = df.to_dict(orient="records")
    except Exception:
        logger.warning(
            "get_pattern_info() failed; pattern_info will be empty in audit artifact.",
            exc_info=True,
        )

    # Calibration
    cal_dict = None
    if calibration_result is not None:
        try:
            cal_dict = calibration_result.to_dict()
        except Exception:
            # Warning level: a missing audit section must not go unnoticed.
            logger.warning("calibration_result.to_dict() failed.", exc_info=True)

    # Explainability
    expl_dict = None
    if explainability_report is not None:
        try:
            expl_dict = json.loads(explainability_report.to_json())
        except Exception:
            # Warning level: a missing audit section must not go unnoticed.
            logger.warning("explainability_report.to_json() failed.", exc_info=True)

    # Compute a hash of the pattern labels for provenance.  Labels are
    # sorted first so the hash does not depend on their order; only the
    # first 16 hex digits of the SHA-256 digest are kept.
    try:
        pattern_labels = classifier.get_hug_features()
        label_str = json.dumps(sorted(pattern_labels), sort_keys=True)
        training_hash = hashlib.sha256(label_str.encode()).hexdigest()[:16]
    except Exception:
        logger.warning(
            "Training hash computation failed; provenance will be 'unavailable'.", exc_info=True
        )
        training_hash = "unavailable"

    artifact = AuditArtifact(
        model_id=model_id,
        training_hash=training_hash,
        model_card=model_card.to_dict(),
        governance=gov_dict,
        fit_metadata=meta_dict,
        pattern_info=pattern_info_list,
        calibration=cal_dict,
        explainability=expl_dict,
        framework_version=__version__,
    )

    manifest_path = os.path.join(output_dir, "audit_manifest.json")
    artifact.save(manifest_path)
    return manifest_path